In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:60% !important; }</style>"))

Imports

In [2]:
# Importing some shizzle.

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import os
import json
import pickle
from utilities import get_header, genepanel_analysis, auc_analysis_function
from utilities import analyze_auc_per_gene, correct_threshold, read_capice_output
from utilities import full_auc_analysis
from sklearn.metrics import recall_score, precision_score, f1_score
from pathlib import Path
import subprocess
from sklearn.metrics import roc_auc_score
from bokeh.palettes import viridis
import time
import math
from scipy.optimize import curve_fit

# Defining some import and export locations
location = 'rjsietsma'
read_loc = '/home/'+location+'/shared/'
data_expor_loc = '/home/'+location+'/Documents/School/Master_DSLS/Final_Thesis/Past_initial_data/'
img_output_dir = '/home/'+location+'/PycharmProjects/dsls_master_thesis/side_scripts/output_img/'

with open('./umcg_genepanels.json', 'r') as panels:
    genepanels = json.load(panels)
    genepanels.pop('5GPM', None)

Creating dyslipid dataset

Index

In [7]:
file_loc = './datafiles/train.txt.gz'
header = get_header(file_loc, '#Chrom')
train = pd.read_csv(file_loc, compression='gzip', names=header, comment='#', sep='\t', low_memory=False)
train
Out[7]:
#Chrom Allergy/Immunology/Infectious Alt AnnoType Audiologic/Otolaryngologic Biochemical CCDS CDSpos Cardiovascular ConsDetail ... revel sift source tOverlapMotifs targetScan to_be_deleted verPhCons verPhyloP inTest sample_weight
0 14 False G CodingTranscript False False CCDS9787.1 806.0 False frameshift ... NaN NaN vkgl NaN NaN False 1.000 5.843 False 1.0
1 20 False T CodingTranscript True False CCDS13112.1 1899.0 True frameshift,stop_gained ... NaN NaN vkgl NaN NaN False 1.000 4.670 False 1.0
2 20 False C CodingTranscript True False CCDS13112.1 2118.0 True frameshift ... NaN NaN vkgl NaN NaN False 1.000 5.043 False 1.0
3 20 False A CodingTranscript True False CCDS13112.1 1586.0 True frameshift ... NaN NaN vkgl NaN NaN False 1.000 6.221 False 1.0
4 20 False A Intergenic True False NaN NaN True downstream ... NaN NaN vkgl NaN NaN False 1.000 6.368 False 1.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
334596 17 False A CodingTranscript False False CCDS32642.1 1563.0 False stop_gained ... NaN NaN unknown NaN NaN False 1.000 6.031 False 0.8
334597 17 False T CodingTranscript False False CCDS32642.1 2029.0 False stop_gained ... NaN NaN unknown NaN NaN False 1.000 4.100 False 0.8
334598 10 False T CodingTranscript False False CCDS7431.1 1216.0 False stop_gained ... NaN NaN unknown NaN NaN False 1.000 5.852 False 0.8
334599 2 False T CodingTranscript False False CCDS2382.1 2998.0 False stop_gained ... NaN NaN unknown NaN NaN False 0.031 2.213 False 0.8
334600 5 False T CodingTranscript False False CCDS3952.1 1221.0 False stop_gained ... NaN NaN unknown NaN NaN False 1.000 0.528 False 0.8

334601 rows × 152 columns

In [8]:
with open('./umcg_genepanels.json', 'r') as json_file:
    genes = json.load(json_file)
dislipid_genes = genes['Hart- en vaatziekten']
genelist = []
for key, value in dislipid_genes.items():
    if key.lower().startswith('dyslipid'):
        for g in value:
            if g not in genelist:
                genelist.append(g)
In [9]:
dislipid_subset = train.loc[train['GeneName'].isin(genelist)]
dislipid_subset
Out[9]:
#Chrom Allergy/Immunology/Infectious Alt AnnoType Audiologic/Otolaryngologic Biochemical CCDS CDSpos Cardiovascular ConsDetail ... revel sift source tOverlapMotifs targetScan to_be_deleted verPhCons verPhyloP inTest sample_weight
142 16 False T CodingTranscript False False CCDS10772.1 848.0 True frameshift ... NaN NaN vkgl NaN NaN False 0.928 2.614 False 1.0
148 19 False CCGGCGAGGTGCAGGCCATGCT CodingTranscript False False CCDS12647.1 409.0 True protein_altering ... NaN NaN vkgl NaN NaN True 0.863 0.839 False 1.0
149 2 False C CodingTranscript False False CCDS1703.1 13028.0 True frameshift ... NaN NaN vkgl NaN NaN False 0.000 0.058 False 1.0
150 2 False G CodingTranscript False False CCDS1703.1 28.0 True frameshift ... NaN NaN vkgl NaN NaN False 0.021 -0.103 False 1.0
151 2 False C CodingTranscript False False CCDS1703.1 2534.0 True frameshift ... NaN NaN vkgl NaN NaN False 0.653 0.251 False 1.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
334284 12 False T CodingTranscript False False CCDS8685.1 1093.0 False stop_gained ... NaN NaN unknown NaN NaN False 0.999 3.072 False 0.8
334285 12 False A CodingTranscript False False CCDS8685.1 1475.0 False stop_gained ... NaN NaN unknown NaN NaN False 0.580 0.563 False 0.8
334286 12 False T CodingTranscript False False CCDS8685.1 1537.0 False stop_gained ... NaN NaN unknown NaN NaN False 0.003 0.643 False 0.8
334287 12 False G CodingTranscript False False CCDS8685.1 1553.0 False stop_gained ... NaN NaN unknown NaN NaN False 0.510 2.765 False 0.8
334288 12 False G CodingTranscript False False CCDS8685.1 1634.0 False stop_gained ... NaN NaN unknown NaN NaN False 0.000 -0.326 False 0.8

5029 rows × 152 columns

In [10]:
dislipid_subset['label'].value_counts()
Out[10]:
Benign        3913
Pathogenic    1116
Name: label, dtype: int64

Model 1.0 statistics

Index

In [11]:
# model_gcc = pickle.load(open('./test_output/model_2_0/default_hyper/base/xgb_gcc_cluster.pickle.dat', 'rb'))
# model_gcc.attributes()
In [12]:
# model_github = pickle.load(open('./test_output/model_2_0/default_hyper/base/xgb_github.pickle.dat', 'rb'))
# model_github.attributes()
In [13]:
test_gcc = read_capice_output('./test_output/test_gccmodel_0721.txt')
test_git = read_capice_output('./test_output/test_githubmodel_0721.txt')
print(test_gcc.shape[0])
test_gcc = test_gcc.append(test_git)
print(test_gcc.shape[0])
test_gcc.drop_duplicates(inplace=True)
print(test_gcc.shape[0])
10842
21684
10842

2.0 Base models

Index

Threshold analysis

In [14]:
train_base20 = read_capice_output('./test_output/model_2_0/result_files/train_base2_0.txt')
train_base20
Out[14]:
GeneName Consequence PHRED probabilities prediction combined_prediction chr pos ref alt
0 BRCA1 FRAME_SHIFT 26.600 9.999967e-01 Pathogenic Pathogenic 17 41246652 ACATTC GA
1 LDLR FRAME_SHIFT 35.000 9.999961e-01 Pathogenic Pathogenic 19 11216252 GACAAA TTT
2 LDLR FRAME_SHIFT 34.000 9.999925e-01 Pathogenic Pathogenic 19 11216255 AAATCTGACG TGCAA
3 LDLR FRAME_SHIFT 34.000 9.999908e-01 Pathogenic Pathogenic 19 11216262 ACG CA
4 BRCA1 FRAME_SHIFT 27.300 9.999901e-01 Pathogenic Pathogenic 17 41246611 AG A
... ... ... ... ... ... ... ... ... ... ...
334596 MYHAS SYNONYMOUS 6.942 3.544337e-07 Neutral Neutral 17 10404694 G A
334597 LIG4 3PRIME_UTR 1.122 3.442061e-07 Neutral Neutral 13 108860318 C CATT
334598 MIR548AZ NON_SYNONYMOUS 12.180 3.056727e-07 Neutral Neutral 14 64519911 G A
334599 MIR548AZ SYNONYMOUS 15.650 2.615435e-07 Neutral Neutral 14 64692103 G A
334600 MYH2 SYNONYMOUS 5.751 2.509487e-07 Neutral Neutral 17 10429052 G A

334601 rows × 10 columns

In [15]:
# correct_threshold(train_base20)
In [16]:
test_base20 = read_capice_output('./test_output/model_2_0/result_files/test_base2_0.txt')
print("XGBoost 1.1.1, python3.8, threshold 0.02:")
auc_analysis_function(train_base20, test_base20)
XGBoost 1.1.1, python3.8, threshold 0.02:
AUC analysis of the training dataset reveals AUC: 0.9971556578875161
AUC analysis of the testing dataset reveals AUC: 0.8937102839970584
In [17]:
train_base20ct = read_capice_output('./test_output/model_2_0/result_files/train_base_xgboost_correctthres.txt')
test_base20ct = read_capice_output('./test_output/model_2_0/result_files/test_base_xgboost_correctthres.txt')
print("XGboost 1.1.1, python3.8, using threshold 0.149:")
auc_analysis_function(train_base20ct, test_base20ct)
XGboost 1.1.1, python3.8, using threshold 0.149:
AUC analysis of the training dataset reveals AUC: 0.9971556578875161
AUC analysis of the testing dataset reveals AUC: 0.8937102839970584

Why is base 2.0 model not showing a threshold close to 0.02?

Investigating original threshold

In [18]:
train_base = read_capice_output('./test_output/model_2_0/result_files/train_base_xgboost_0721.txt')
train_base
Out[18]:
GeneName Consequence PHRED probabilities prediction combined_prediction chr pos ref alt
0 BRCA1 FRAME_SHIFT 26.600 9.999933e-01 Pathogenic Pathogenic 17 41246652 ACATTC GA
1 LDLR FRAME_SHIFT 35.000 9.999907e-01 Pathogenic Pathogenic 19 11216246 TGCAAGGACAAATCTGAC CCGACTG
2 LDLR FRAME_SHIFT 34.000 9.999896e-01 Pathogenic Pathogenic 19 11216251 GGACAAATCTGACGA AACTGCGGTAAACTGCGGTAAACT
3 LDLR FRAME_SHIFT 34.000 9.999894e-01 Pathogenic Pathogenic 19 11216262 ACG CA
4 MSH2 FRAME_SHIFT 35.000 9.999891e-01 Pathogenic Pathogenic 2 47702328 GTTGA TTTC
... ... ... ... ... ... ... ... ... ... ...
334596 MIR548AZ SYNONYMOUS 14.370 2.765540e-07 Neutral Neutral 14 64653189 T C
334597 MYHAS SYNONYMOUS 0.611 2.524258e-07 Neutral Neutral 17 10419945 A G
334598 MYHAS SYNONYMOUS 9.827 2.164549e-07 Neutral Neutral 17 10419849 T G
334599 LOC100289580 SYNONYMOUS 14.100 1.984229e-07 Neutral Neutral 16 88804658 G A
334600 MYHAS SYNONYMOUS 10.040 1.227600e-07 Neutral Neutral 17 10419849 T C

334601 rows × 10 columns

In [19]:
# train_base_recall, train_base_threshold = correct_threshold(train_base)
In [20]:
train_in = pd.read_csv(
     './datafiles/train.txt.gz',
      compression='gzip', sep='\t', low_memory=False)
data = train_base.merge(
     train_in[['#Chrom', 'Pos', 'Ref', 'Alt', 'label']],
     left_on=['chr', 'pos', 'ref', 'alt'],
     right_on=['#Chrom', 'Pos', 'Ref', 'Alt'])
drop_labels = ['#Chrom', 'Pos', 'Ref', 'Alt']
for x in data.columns:
    if x.endswith('_x') or x.endswith('_y'):
        drop_labels.append(x)
data.drop(columns=drop_labels, inplace=True)
data
Out[20]:
GeneName Consequence PHRED probabilities prediction combined_prediction chr pos ref alt label
0 BRCA1 FRAME_SHIFT 26.600 9.999933e-01 Pathogenic Pathogenic 17 41246652 ACATTC GA Pathogenic
1 LDLR FRAME_SHIFT 35.000 9.999907e-01 Pathogenic Pathogenic 19 11216246 TGCAAGGACAAATCTGAC CCGACTG Pathogenic
2 LDLR FRAME_SHIFT 34.000 9.999896e-01 Pathogenic Pathogenic 19 11216251 GGACAAATCTGACGA AACTGCGGTAAACTGCGGTAAACT Pathogenic
3 LDLR FRAME_SHIFT 34.000 9.999894e-01 Pathogenic Pathogenic 19 11216262 ACG CA Pathogenic
4 MSH2 FRAME_SHIFT 35.000 9.999891e-01 Pathogenic Pathogenic 2 47702328 GTTGA TTTC Pathogenic
... ... ... ... ... ... ... ... ... ... ... ...
334596 MIR548AZ SYNONYMOUS 14.370 2.765540e-07 Neutral Neutral 14 64653189 T C Benign
334597 MYHAS SYNONYMOUS 0.611 2.524258e-07 Neutral Neutral 17 10419945 A G Benign
334598 MYHAS SYNONYMOUS 9.827 2.164549e-07 Neutral Neutral 17 10419849 T G Benign
334599 LOC100289580 SYNONYMOUS 14.100 1.984229e-07 Neutral Neutral 16 88804658 G A Benign
334600 MYHAS SYNONYMOUS 10.040 1.227600e-07 Neutral Neutral 17 10419849 T C Benign

334601 rows × 11 columns

In [21]:
def default_threshold(row):
    return_value = 0
    if row > 0.02:
        return_value = 1
    return return_value
In [22]:
data['pred'] = data['probabilities'].apply(lambda x: default_threshold(x))
In [23]:
data['label'].replace({'Pathogenic': 1, 'Benign': 0}, inplace=True)
In [24]:
y_true = np.array(data['label'])
y_pred = np.array(data['pred'])
print(f"The recall score of using Python3.8 and threshold 0.02 is: {recall_score(y_true, y_pred)}")
print(f"The precision score of using Python3.8 and threshold 0.02 is: {precision_score(y_true, y_pred)}")
print(f"Resulting in a F1 score of: {f1_score(y_true, y_pred)}")
The recall score of using Python3.8 and threshold 0.02 is: 0.9893316290160026
The precision score of using Python3.8 and threshold 0.02 is: 0.7102746011577016
Resulting in a F1 score of: 0.8268940366736864
In [25]:
test_base = read_capice_output('./test_output/model_2_0/result_files/test_base_xgboost_0721.txt')
test_base
Out[25]:
GeneName Consequence PHRED probabilities prediction combined_prediction chr pos ref alt
0 MSH2 FRAME_SHIFT 34.000 0.999988 Pathogenic Pathogenic 2 47639582 AGAAA TAAT
1 BRCA1 FRAME_SHIFT 22.000 0.999986 Pathogenic Pathogenic 17 41245330 CTTTA TTT
2 BRCA1 STOP_GAINED 34.000 0.999983 Pathogenic Pathogenic 17 41243705 CTGAG GCCT
3 BRCA2 FRAME_SHIFT 17.090 0.999981 Pathogenic Pathogenic 13 32906819 GG G
4 BRCA2 FRAME_SHIFT 22.300 0.999981 Pathogenic Pathogenic 13 32913009 TC T
... ... ... ... ... ... ... ... ... ... ...
10837 MIR548AZ NON_SYNONYMOUS 5.220 0.000002 Neutral Neutral 14 64593063 G A
10838 LOC100289580 UPSTREAM 1.856 0.000002 Neutral Neutral 16 88798720 G A
10839 MYHAS DOWNSTREAM 0.035 0.000002 Neutral Neutral 17 10442488 G A
10840 MIR548AZ NON_SYNONYMOUS 1.347 0.000001 Neutral Neutral 14 64519932 A G
10841 LOC100289580 UPSTREAM 0.168 0.000001 Neutral Neutral 16 88801032 G T

10842 rows × 10 columns

In [26]:
print('Using XGboost 0.72.1 on python3.8, default threshold (0.02):')
auc_analysis_function(train_base, test_base)
Using XGboost 0.72.1 on python3.8, default threshold (0.02):
AUC analysis of the training dataset reveals AUC: 0.9971523753640674
AUC analysis of the testing dataset reveals AUC: 0.893641835924645
AUC analysis, but now with a new threshold.
In [27]:
train_basect = read_capice_output('./test_output/model_2_0/result_files/train_base_xgboost_0721_correctthres.txt')
test_basect = read_capice_output('./test_output/model_2_0/result_files/test_base_xgboost_0721_correctthres.txt')
print("Using XGboost 0.72.1 on python3.8, threshold set to 0.152:")
auc_analysis_function(train_basect, test_basect)
Using XGboost 0.72.1 on python3.8, threshold set to 0.152:
AUC analysis of the training dataset reveals AUC: 0.9971523753640674
AUC analysis of the testing dataset reveals AUC: 0.893641835924645

Actual correct 2.0 model without bugs.

Index

In [28]:
default_hyper_version111_model = pickle.load(open('./test_output/model_2_0/default_hyper/base/unbalanced/xgb_defaulthyper_111.pickle.dat', 'rb'))
default_hyper_version111_model
Out[28]:
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.10495845238185281, max_delta_step=0, max_depth=15,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=422, n_jobs=8, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=0)
In [29]:
train_forsure_default = read_capice_output('./test_output/model_2_0/default_hyper/result_files/train_defaulthyper_nothres.txt')
test_forsure_default = read_capice_output('./test_output/model_2_0/default_hyper/result_files/test_defaulthyper_nothres.txt')
print('Model 2.0, default hyperparameters, xgboost 1.1.1, python3.7, threshold 0.02:')
auc_analysis_function(train_forsure_default, test_forsure_default)
Model 2.0, default hyperparameters, xgboost 1.1.1, python3.7, threshold 0.02:
AUC analysis of the training dataset reveals AUC: 0.9971556579293329
AUC analysis of the testing dataset reveals AUC: 0.893710301011245
In [30]:
# correct_threshold(train_results=train_forsure_default, include_upper=True)
In [31]:
test_forsure_default
Out[31]:
GeneName Consequence PHRED probabilities prediction combined_prediction chr pos ref alt
0 BRCA1 STOP_GAINED 34.000 0.999991 Pathogenic Pathogenic 17 41243705 CTGAG GCCT
1 BRCA2 FRAME_SHIFT 26.900 0.999990 Pathogenic Pathogenic 13 32912883 CTG TT
2 BRCA2 FRAME_SHIFT 24.100 0.999989 Pathogenic Pathogenic 13 32914822 TAA T
3 BRCA2 FRAME_SHIFT 25.600 0.999989 Pathogenic Pathogenic 13 32913442 TC T
4 BRCA1 FRAME_SHIFT 19.390 0.999988 Pathogenic Pathogenic 17 41246507 CAG C
... ... ... ... ... ... ... ... ... ... ...
10837 SYNE1 SPLICE_SITE 14.930 0.000002 Neutral Neutral 6 152722482 T C
10838 NBPF20 REGULATORY 3.320 0.000002 Neutral Neutral 1 145439717 C G
10839 NBPF20 REGULATORY 3.306 0.000001 Neutral Neutral 1 145440240 C T
10840 MIR548AZ NON_SYNONYMOUS 5.553 0.000001 Neutral Neutral 14 64537567 G C
10841 MIR548AZ NON_SYNONYMOUS 1.347 0.000001 Neutral Neutral 14 64519932 A G

10842 rows × 10 columns

In [32]:
train_forsure_oldmodel = read_capice_output('./test_output/model_2_0/default_hyper/result_files/train_xgboost0721_defaulthyper.txt')
test_forsure_oldmodel = read_capice_output('./test_output/model_2_0/default_hyper/result_files/test_xgboost0721_defaulthyper.txt')
print('Model 1.0, default hyperparameters, xgboost 0.72.1, python3.8, threshold 0.02, forced xgboost install:')
auc_analysis_function(train_forsure_oldmodel, test_forsure_oldmodel)
Model 1.0, default hyperparameters, xgboost 0.72.1, python3.8, threshold 0.02, forced xgboost install:
AUC analysis of the training dataset reveals AUC: 0.9971523753640674
AUC analysis of the testing dataset reveals AUC: 0.893641835924645

Original model

Index

In [33]:
train_python36 = read_capice_output('./test_output/model_2_0/result_files/train_python36_defaultthres.txt')
test_python36 = read_capice_output('./test_output/model_2_0/result_files/test_python36_defaultthres.txt')
print("Using python3.6, xgboost 0.72.1, threshold default:")
auc_analysis_function(train_python36, test_python36)
Using python3.6, xgboost 0.72.1, threshold default:
AUC analysis of the training dataset reveals AUC: 0.9971523753640674
AUC analysis of the testing dataset reveals AUC: 0.893641835924645

Confirmed: The 2.0 model performs just as good as model 1.0

CV Models

Index

In [34]:
model = pickle.load(open('./models/xgb_weightedSample_randomsearch_v2.pickle.dat', 'rb'))
xgbmodel = model.best_estimator_
/home/rjsietsma/PycharmProjects/dsls_master_thesis/venv/lib/python3.8/site-packages/sklearn/base.py:329: UserWarning: Trying to unpickle estimator RandomizedSearchCV from version 0.23.1 when using version 0.23.2. This might lead to breaking code or invalid results. Use at your own risk.
  warnings.warn(
In [35]:
model_dislipid = pickle.load(open('./models/xgb_weightedSample_randomsearch_dislipid.pickle.dat', 'rb'))
xgbmodel_dislipid = model_dislipid.best_estimator_
dir(xgbmodel_dislipid._Booster)
Out[35]:
['__class__',
 '__copy__',
 '__deepcopy__',
 '__del__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_validate_features',
 'attr',
 'attributes',
 'best_iteration',
 'best_ntree_limit',
 'best_score',
 'boost',
 'booster',
 'copy',
 'dump_model',
 'eval',
 'eval_set',
 'feature_names',
 'feature_types',
 'get_dump',
 'get_fscore',
 'get_score',
 'get_split_value_histogram',
 'handle',
 'inplace_predict',
 'load_config',
 'load_model',
 'load_rabit_checkpoint',
 'predict',
 'save_config',
 'save_model',
 'save_rabit_checkpoint',
 'save_raw',
 'set_attr',
 'set_param',
 'trees_to_dataframe',
 'update']
In [36]:
model_ek = pickle.load(open('./models/xgb_ransearch_ek_dataset.pickle.dat', 'rb'))

New Models

Index

In [37]:
model = pickle.load(open('./xgbmodels/xgb_booster_v2.pickle.dat', 'rb'))
model.feature_importances_
Out[37]:
array([0.00282909, 0.00207738, 0.00183909, 0.00320904, 0.00281486,
       0.00190347, 0.00192447, 0.00191927, 0.00175739, 0.00351866,
       0.00253019, 0.00301028, 0.00308079, 0.00301589, 0.00242207,
       0.00284953, 0.00254809, 0.00301885, 0.00276217, 0.00329895,
       0.0029472 , 0.00288387, 0.00196225, 0.00324255, 0.00249287,
       0.00308638, 0.00275804, 0.00326703, 0.003321  , 0.02822421,
       0.01098935, 0.00849544, 0.0026135 , 0.0021958 , 0.00212344,
       0.02957111, 0.00307086, 0.00227875, 0.00224516, 0.00332749,
       0.00291972, 0.00251198, 0.00405553, 0.00422445, 0.00319547,
       0.00250752, 0.00358728, 0.0023705 , 0.00314594, 0.00369558,
       0.00300821, 0.00350186, 0.00383268, 0.00286349, 0.00239821,
       0.00318689, 0.00370395, 0.00359068, 0.00305595, 0.01804393,
       0.00381216, 0.00197791, 0.0018304 , 0.0023901 , 0.00244684,
       0.00294106, 0.00317492, 0.00303257, 0.00199907, 0.00196537,
       0.00232091, 0.00312337, 0.00172605, 0.00178056, 0.        ,
       0.00235593, 0.00206226, 0.00217503, 0.00434332, 0.0020939 ,
       0.00449965, 0.01930718, 0.0018447 , 0.00274791, 0.00609136,
       0.00580421, 0.00188665, 0.00931865, 0.00234432, 0.00315913,
       0.00402672, 0.00646648, 0.00910742, 0.00322477, 0.00176483,
       0.00343788, 0.00176548, 0.07158165, 0.0033759 , 0.        ,
       0.00535233, 0.0063484 , 0.00330518, 0.00307293, 0.00348948,
       0.00726572, 0.01224917, 0.01062934, 0.02123239, 0.00196283,
       0.00444081, 0.00216935, 0.00285928, 0.0026971 , 0.00244203,
       0.00233031, 0.14946337, 0.02471637, 0.02496736, 0.01712855,
       0.00207521, 0.00310974, 0.00341634, 0.16352823, 0.00382764,
       0.01300227, 0.00745142, 0.01435524, 0.01285369, 0.00542477,
       0.00070084], dtype=float32)
In [38]:
model_dislipid = pickle.load(open('./xgbmodels/xgb_booster_dyslipid.pickle.dat', 'rb'))
model_dislipid.feature_importances_
Out[38]:
array([0.00278866, 0.00208772, 0.00174322, 0.00296931, 0.00269667,
       0.00179555, 0.00186813, 0.00186437, 0.00175499, 0.00395201,
       0.00240749, 0.00280701, 0.00290935, 0.00285559, 0.0023123 ,
       0.0027687 , 0.0024073 , 0.00306501, 0.00286279, 0.00350749,
       0.00261141, 0.00282689, 0.00188535, 0.00320358, 0.00238541,
       0.00293148, 0.00261692, 0.00331071, 0.00368271, 0.02656523,
       0.01037843, 0.00846208, 0.00255091, 0.00212624, 0.00206606,
       0.03021989, 0.0027948 , 0.00225078, 0.00219586, 0.00343691,
       0.00298393, 0.00254942, 0.00387159, 0.00394147, 0.00327632,
       0.00257063, 0.00359947, 0.00250112, 0.00339626, 0.00355939,
       0.0030033 , 0.00344496, 0.00338445, 0.00294264, 0.00213853,
       0.00313247, 0.00377457, 0.00349594, 0.00298174, 0.01875002,
       0.00379864, 0.00185824, 0.00179364, 0.00239529, 0.00231335,
       0.00317147, 0.00292523, 0.00316273, 0.00262851, 0.00228997,
       0.00601442, 0.00521736, 0.00169761, 0.00183883, 0.        ,
       0.00234997, 0.00158524, 0.00213381, 0.00315469, 0.00248724,
       0.00430137, 0.01801617, 0.0018445 , 0.00280421, 0.00624151,
       0.00602516, 0.00171275, 0.00894723, 0.00171685, 0.00311938,
       0.00349789, 0.00585964, 0.00859777, 0.00276252, 0.00209822,
       0.00323153, 0.00161626, 0.06846787, 0.0036844 , 0.        ,
       0.00444979, 0.00538614, 0.00277188, 0.00351531, 0.00360888,
       0.00591644, 0.01264253, 0.00823699, 0.0243496 , 0.00018329,
       0.00428347, 0.00217733, 0.00302215, 0.0026641 , 0.00253335,
       0.00206614, 0.16421454, 0.02516109, 0.02144627, 0.02164953,
       0.00223931, 0.00240782, 0.00254637, 0.16032705, 0.00204109,
       0.01342087, 0.00806119, 0.01339926, 0.01314391, 0.005304  ,
       0.00024527], dtype=float32)
In [39]:
model_ek = model_ek.best_estimator_
model_ek.feature_importances_
Out[39]:
array([5.0133662e-03, 1.2882849e-03, 1.0817926e-03, 6.3643702e-03,
       2.4500915e-03, 1.2766811e-03, 1.6910619e-03, 1.9211929e-03,
       1.3221180e-03, 1.4542753e-03, 1.7586007e-03, 3.7318198e-03,
       1.4746373e-03, 2.0313114e-03, 1.3998400e-03, 1.0245102e-03,
       1.8737183e-03, 1.2974218e-03, 1.8016534e-03, 1.4174232e-03,
       1.7281594e-03, 2.0112914e-03, 1.3021030e-03, 1.5508684e-03,
       2.8008316e-03, 1.2582513e-03, 1.9976879e-03, 2.1160368e-03,
       2.7596293e-02, 4.1479445e-03, 1.9538682e-02, 1.1834032e-02,
       1.5302061e-03, 1.3707022e-03, 1.8456797e-03, 2.6172997e-02,
       2.8442349e-03, 1.3298710e-03, 1.2373156e-03, 1.3766757e-03,
       1.9199389e-03, 1.9936443e-03, 4.2407308e-03, 2.6613504e-03,
       4.2698625e-03, 1.5507338e-03, 2.0464479e-03, 1.2644089e-03,
       1.8295237e-03, 2.7150142e-03, 2.2141868e-03, 4.2772172e-03,
       4.4871587e-03, 1.0170736e-03, 1.9717950e-03, 1.3658322e-03,
       1.7440096e-03, 1.4196830e-03, 1.5646445e-03, 1.7786717e-02,
       7.2565661e-03, 1.2215808e-03, 1.0896001e-03, 1.2801936e-03,
       1.6787732e-03, 1.3853315e-03, 3.8878783e-03, 3.6497233e-03,
       2.5625152e-03, 6.4594147e-04, 0.0000000e+00, 2.6741764e-04,
       1.0275145e-03, 9.8351133e-04, 0.0000000e+00, 1.8497148e-03,
       1.3164843e-03, 2.0394444e-03, 7.7678368e-04, 1.4666696e-04,
       1.8183066e-03, 5.4495376e-02, 9.8599296e-04, 2.5706911e-03,
       1.5066749e-03, 1.3026128e-03, 0.0000000e+00, 3.9810329e-03,
       2.6930047e-02, 1.4247919e-03, 0.0000000e+00, 0.0000000e+00,
       3.4228545e-03, 2.5650777e-03, 1.7217444e-03, 1.3366465e-03,
       1.2302867e-03, 2.8391546e-03, 2.0883349e-03, 0.0000000e+00,
       1.6282888e-03, 8.7434653e-04, 1.1597392e-03, 3.0243149e-04,
       1.6077551e-03, 8.6761132e-04, 5.9726797e-03, 7.2044302e-03,
       2.3137821e-02, 0.0000000e+00, 6.6684320e-04, 8.8741980e-04,
       1.2830134e-03, 4.4335630e-03, 2.0581577e-03, 8.1075827e-04,
       1.6909115e-02, 6.6173850e-03, 2.2486884e-03, 1.1616969e-01,
       3.9777033e-02, 2.2247344e-02, 3.0404744e-01, 6.3085511e-02,
       7.9946109e-04, 6.1545370e-04, 1.0288571e-03, 8.8923089e-03,
       3.8198836e-04, 1.3955535e-03, 0.0000000e+00], dtype=float32)

Mismatch analysis

Index

In [40]:
file = './datafiles/train_results.txt.gz'
if os.path.isfile(file):
    train_original = pd.read_csv(file, sep='\t', low_memory=False)
else:
    train_original = pd.DataFrame(columns=['chr', 'pos', 'ref', 'alt', 'prediction'])
train_original
Out[40]:
chr pos ref alt prediction
In [41]:
train_new = read_capice_output('./datafiles/train_results_v4.txt.gz')
train_new
Out[41]:
GeneName Consequence PHRED probabilities prediction combined_prediction chr pos ref alt
0 LDLR FRAME_SHIFT 35.000 9.999982e-01 Pathogenic Pathogenic 19 11216252 GACAAA TTT
1 BRCA2 FRAME_SHIFT 35.000 9.999978e-01 Pathogenic Pathogenic 13 32936775 GAATTT AG
2 LDLR FRAME_SHIFT 35.000 9.999976e-01 Pathogenic Pathogenic 19 11216246 TGCAAGGACAAATCTGAC CCGACTG
3 LDLR FRAME_SHIFT 34.000 9.999967e-01 Pathogenic Pathogenic 19 11216262 ACG CA
4 BRCA2 FRAME_SHIFT 20.800 9.999956e-01 Pathogenic Pathogenic 13 32906847 TACCCCTATTG ACAT
... ... ... ... ... ... ... ... ... ... ...
334596 MYHAS SYNONYMOUS 10.130 7.769997e-08 Neutral Neutral 17 10429043 G A
334597 MYHAS SYNONYMOUS 15.280 7.578360e-08 Neutral Neutral 17 10415744 A G
334598 MIR1273H SYNONYMOUS 13.040 6.680243e-08 Neutral Neutral 4 39448569 G A
334599 MYHAS SYNONYMOUS 12.770 5.909591e-08 Neutral Neutral 17 10318882 G A
334600 MYH8 SPLICE_SITE 2.474 4.037909e-08 Neutral Neutral 17 10318897 G A

334601 rows × 10 columns

In [42]:
merge = train_original[['chr', 'pos','ref','alt','prediction']].merge(train_new[['chr', 'pos','ref','alt','prediction']],
                                                                     on=['chr', 'pos','ref','alt'])
merge[merge['prediction_x'] != merge['prediction_y']]
if train_original.shape[0] > 0:
    print(f"There is a "
          f"{merge[merge['prediction_x'] != merge['prediction_y']].shape[0] / train_original.shape[0] * 100}% mismatch.")
In [43]:
file = './datafiles/test_results.txt'
if os.path.isfile(file):
    test_original = pd.read_csv(file, sep='\t', low_memory=False)
    tellPathogenic_pred = lambda x: "Pathogenic" if x > 0.02 else 'Neutral'
    test_original['prediction'] = [tellPathogenic_pred(probability) for probability in test_original['capice']]
    test_original.rename(columns={'#Chrom': 'chr', 'Pos':'pos', 'Ref': 'ref', 'Alt':'alt'}, inplace=True)
else:
    test_original = pd.DataFrame(columns=['chr', 'pos', 'ref', 'alt', 'prediction'])
test_original
Out[43]:
chr pos ref alt prediction
In [44]:
test_new = read_capice_output('./datafiles/test_results_v4.txt.gz')
test_new
Out[44]:
GeneName Consequence PHRED probabilities prediction combined_prediction chr pos ref alt
0 BRCA1 STOP_GAINED 34.000 9.999962e-01 Pathogenic Pathogenic 17 41243705 CTGAG GCCT
1 BRCA2 FRAME_SHIFT 26.900 9.999943e-01 Pathogenic Pathogenic 13 32912883 CTG TT
2 MSH2 FRAME_SHIFT 34.000 9.999934e-01 Pathogenic Pathogenic 2 47639582 AGAAA TAAT
3 BRCA2 FRAME_SHIFT 24.100 9.999933e-01 Pathogenic Pathogenic 13 32912590 TA T
4 BRCA2 FRAME_SHIFT 25.600 9.999924e-01 Pathogenic Pathogenic 13 32913442 TC T
... ... ... ... ... ... ... ... ... ... ...
10837 MIR548AZ NON_SYNONYMOUS 5.553 1.214339e-06 Neutral Neutral 14 64537567 G C
10838 MYHAS INTRONIC 0.068 8.699849e-07 Neutral Neutral 17 10432854 G A
10839 NBPF20 REGULATORY 3.320 6.732982e-07 Neutral Neutral 1 145439717 C G
10840 MYHAS DOWNSTREAM 0.035 6.130708e-07 Neutral Neutral 17 10442488 G A
10841 MIR548AZ NON_SYNONYMOUS 1.347 4.053531e-07 Neutral Neutral 14 64519932 A G

10842 rows × 10 columns

In [45]:
merge = test_original[['chr', 'pos','ref','alt','prediction']].merge(test_new[['chr', 'pos','ref','alt','prediction']],
                                                                     on=['chr', 'pos','ref','alt'])
merge[merge['prediction_x'] != merge['prediction_y']]
if test_original.shape[0] > 0:
    print(f"There is a "
          f"{merge[merge['prediction_x'] != merge['prediction_y']].shape[0] / test_original.shape[0] * 100}% mismatch.")
    

Base | Default hyper | model 1.0 | balanced ds?

(Back)

In [2]:
full_auc_analysis(
    curr_setup = 'Base, Default Hyper, model 1.0, balanced ds',
    train_loc = './test_output/train_model1.txt',
    test_loc = './test_output/test_model1.txt',
    auc_analysis_name= 'auc_analysis_model1.csv',
    training_set_loc='./datafiles/train.txt.gz',
    filter_out='./datafiles/train.txt.gz'
)

# Note, Angioedema is not shown in here, 
# since test.txt.gz only contains 3 pathogenic variants for this panel.

#TODO: First calculate AUC of training set, then filter out.
There are 334601 samples in the training set.
AUC analysis of the testing dataset reveals AUC: 0.893641835924645
File ./not_saving_directory/auc_analysis_model1.csv found. Loading.
Top 10 worst performing genes: 
       gene  auc        f1  recall  fpr  precision  n_benign  n_malign  n_tot  \
320   NLRC4  0.0  0.000000     0.0  1.0   0.000000         1         1      2   
180   SALL4  0.0  0.666667     1.0  0.0   0.500000         1         1      2   
313     FUS  0.0  0.000000     0.0  1.0   0.000000         1         1      2   
176    TLK2  0.0  0.666667     1.0  0.0   0.500000         1         1      2   
175  GABRG2  0.0  0.500000     0.5  0.5   0.500000         1         2      3   
166    CHD8  0.0  0.400000     1.0  0.0   0.250000         3         1      4   
48      IDS  0.0  0.666667     1.0  0.0   0.500000         1         1      2   
54    BMPR2  0.0  0.500000     1.0  0.0   0.333333         2         1      3   
150    SDHC  0.0  0.400000     1.0  0.0   0.250000         3         1      4   
309  SPTLC2  0.0  0.000000     0.0  1.0   0.000000         1         1      2   

     n_train  n_test  
320        0       2  
180        0       2  
313        0       2  
176        0       2  
175        0       3  
166        0       4  
48         0       2  
54         0       3  
150        0       4  
309        0       2  
/home/rjsietsma/PycharmProjects/dsls_master_thesis/side_scripts/utilities.py:496: UserWarning: Category Angioedema did not contain enough datapoints for Mann-Whitney analysis!
  warnings.warn(f"Category {category} did not contain enough datapoints for Mann-Whitney analysis!")
UMCG genepanels Mann-Whitney analysis: 
    two-sided      less   greater category_x                 compared_to  \
0    0.846358  0.580918  0.423179        all               Neurogenetics   
1    0.243375  0.121688  0.885307        all                 Amyloidosis   
2    0.386984  0.809135  0.193492        all              Cardiovascular   
3    0.367164  0.819845  0.183582        all    Primary Immunodeficiency   
4    0.413667  0.206833  0.796853        all                        Skin   
5    0.508535  0.749195  0.254267        all                    Epilepsy   
6         NaN       NaN       NaN        all                  Angioedema   
7    0.310351  0.155176  0.849067        all                   Metabolic   
8    0.804553  0.611339  0.402277        all    Hyper-/ hypophosphatemia   
9    0.750362  0.375181  0.638148        all                Mitochondria   
10   0.733636  0.640919  0.366818        all               Preconception   
11   0.577635  0.288818  0.718188        all    Congenital heart defects   
12   0.273890  0.136945  0.864913        all           Hereditary cancer   
13   0.915540  0.457770  0.556225        all  Early onset cardiomyopathy   
14   0.915540  0.556225  0.457770        all             Noonan syndrome   
15   0.671409  0.677086  0.335704        all  Primary ciliary dyskinesia   
16   0.971800  0.528183  0.485900        all     Developmental disorders   
17   0.513113  0.256556  0.754698        all           Leukemia-Lymphoma   

        mean       std                  category_y n_benign n_patho  n_tot  \
0   0.697794  0.120754               Neurogenetics    23659    6110  29769   
1   0.600000  0.000000                 Amyloidosis      367      23    390   
2   0.657371  0.165089              Cardiovascular    29144    5914  35058   
3   0.561431  0.044030    Primary Immunodeficiency    14028    2931  16959   
4   0.643665  0.117672                        Skin     8947    1714  10661   
5   0.747215  0.093287                    Epilepsy    14220    4052  18272   
6        NaN       NaN                  Angioedema       66      15     81   
7   0.599708  0.242118                   Metabolic    44555   15463  60018   
8   0.750000  0.000000    Hyper-/ hypophosphatemia      579     478   1057   
9   0.682984  0.000000                Mitochondria     6705    1762   8467   
10  0.750000  0.250000               Preconception      643     471   1114   
11  0.694752  0.021880    Congenital heart defects    23106    6384  29490   
12  0.696488  0.137958           Hereditary cancer    16869    9907  26776   
13  0.712184  0.000000  Early onset cardiomyopathy    30999   10073  41072   
14  0.734956  0.000000             Noonan syndrome     2524    1172   3696   
15  0.777778  0.000000  Primary ciliary dyskinesia     1534     265   1799   
16  0.732661  0.000000     Developmental disorders    59089   17262  76351   
17  0.666667  0.000000           Leukemia-Lymphoma     1227     237   1464   

   n_train  
0    29769  
1      390  
2    35058  
3    16959  
4    10661  
5    18272  
6       81  
7    60018  
8     1057  
9     8467  
10    1114  
11   29490  
12   26776  
13   41072  
14    3696  
15    1799  
16   76351  
17    1464  
The mean of the M-W analysis AUC: 0.6885678105581117

Base | Default hyper | unbalanced ds

(Back)

In [3]:
full_auc_analysis(
    curr_setup = 'Base, Default Hyper, model 2.0, unbalanced ds',
    train_loc = './test_output/model_2_0/default_hyper/base/unbalanced/base_unbalanced_train.txt',
    test_loc = './test_output/model_2_0/default_hyper/base/unbalanced/base_unbalanced_test.txt',
    auc_analysis_name= 'auc_analysis_bdhud.csv',
    training_set_loc = './datafiles/train.txt.gz',
    filter_out='./datafiles/train.txt.gz'
)

# Note, Angioedema is not shown in here, 
# since test.txt.gz only contains 3 pathogenic variants for this panel.
There are 334601 samples in the training set.
AUC analysis of the testing dataset reveals AUC: 0.893710301011245
File ./not_saving_directory/auc_analysis_bdhud.csv found. Loading.
Top 10 worst performing genes: 
       gene  auc        f1  recall  fpr  precision  n_benign  n_patho  n_tot  \
320   NLRC4  0.0  0.000000     0.0  1.0   0.000000         1        1      2   
259    SMC3  0.0  0.000000     0.0  1.0   0.000000         4        1      5   
205     ELN  0.0  0.571429     1.0  0.0   0.400000         3        2      5   
200  GABRG2  0.0  0.500000     0.5  0.5   0.500000         1        2      3   
34    FGFR3  0.0  0.857143     1.0  0.0   0.750000         1        3      4   
187   NAA15  0.0  0.400000     1.0  0.0   0.250000         3        1      4   
314    TBX4  0.0  0.000000     0.0  1.0   0.000000         1        1      2   
48      IDS  0.0  0.666667     1.0  0.0   0.500000         1        1      2   
218   NFKB2  0.0  0.500000     1.0  0.0   0.333333         2        1      3   
175    MYCN  0.0  0.666667     1.0  0.0   0.500000         1        1      2   

     n_train  n_test  
320        0       2  
259        0       5  
205        0       5  
200        0       3  
34         0       4  
187        0       4  
314        0       2  
48         0       2  
218        0       3  
175        0       2  
/home/rjsietsma/PycharmProjects/dsls_master_thesis/side_scripts/utilities.py:496: UserWarning: Category Angioedema did not contain enough datapoints for Mann-Whitney analysis!
  warnings.warn(f"Category {category} did not contain enough datapoints for Mann-Whitney analysis!")
UMCG genepanels Mann-Whitney analysis: 
    two-sided      less   greater category_x                 compared_to  \
0    0.499294  0.753667  0.249647        all               Neurogenetics   
1    0.167967  0.083983  0.921336        all                 Amyloidosis   
2    0.519543  0.743335  0.259771        all              Cardiovascular   
3    0.432420  0.787574  0.216210        all    Primary Immunodeficiency   
4    0.487542  0.243771  0.760279        all                        Skin   
5    0.987029  0.493514  0.510809        all                    Epilepsy   
6         NaN       NaN       NaN        all                  Angioedema   
7    0.355095  0.177548  0.827085        all                   Metabolic   
8    0.804544  0.611344  0.402272        all    Hyper-/ hypophosphatemia   
9    0.376791  0.188396  0.820999        all                Mitochondria   
10   0.403547  0.803981  0.201773        all               Preconception   
11   0.470376  0.235188  0.771105        all    Congenital heart defects   
12   0.074435  0.037218  0.963469        all           Hereditary cancer   
13   0.971798  0.485899  0.528184        all  Early onset cardiomyopathy   
14   0.915536  0.556228  0.457768        all             Noonan syndrome   
15   0.645812  0.689679  0.322906        all  Primary ciliary dyskinesia   
16   0.887545  0.570153  0.443772        all     Developmental disorders   
17   0.468614  0.776399  0.234307        all           Leukemia-Lymphoma   

        mean       std                  category_y n_benign n_patho  n_tot  \
0   0.725717  0.116495               Neurogenetics    23659    6110  29769   
1   0.500000  0.000000                 Amyloidosis      367      23    390   
2   0.665865  0.162141              Cardiovascular    29144    5914  35058   
3   0.535999  0.050910    Primary Immunodeficiency    14028    2931  16959   
4   0.657091  0.125063                        Skin     8947    1714  10661   
5   0.709276  0.060229                    Epilepsy    14220    4052  18272   
6        NaN       NaN                  Angioedema       66      15     81   
7   0.636198  0.190424                   Metabolic    44555   15463  60018   
8   0.750000  0.000000    Hyper-/ hypophosphatemia      579     478   1057   
9   0.638861  0.000000                Mitochondria     6705    1762   8467   
10  0.833333  0.166667               Preconception      643     471   1114   
11  0.685139  0.016697    Congenital heart defects    23106    6384  29490   
12  0.671380  0.146518           Hereditary cancer    16869    9907  26776   
13  0.716755  0.000000  Early onset cardiomyopathy    30999   10073  41072   
14  0.733386  0.000000             Noonan syndrome     2524    1172   3696   
15  0.777778  0.000000  Primary ciliary dyskinesia     1534     265   1799   
16  0.736443  0.000000     Developmental disorders    59089   17262  76351   
17  0.833333  0.000000           Leukemia-Lymphoma     1227     237   1464   

   n_train  
0    29769  
1      390  
2    35058  
3    16959  
4    10661  
5    18272  
6       81  
7    60018  
8     1057  
9     8467  
10    1114  
11   29490  
12   26776  
13   41072  
14    3696  
15    1799  
16   76351  
17    1464  
The mean of the M-W analysis AUC: 0.6945031911186837

Base | Default hyper | balanced ds

(Back)

In [2]:
full_auc_analysis(
    curr_setup = 'Base, Default Hyper, model 2.0, balanced ds',
    train_loc = './test_output/model_2_0/default_hyper/base/balanced/base_balanced_train.txt',
    test_loc = './test_output/model_2_0/default_hyper/base/balanced/base_balanced_test.txt',
    auc_analysis_name= 'auc_analysis_bdhbd.csv',
    training_set_loc = './test_output/model_2_0/default_hyper/base/balanced/train_balanced_dataset.tsv.gz',
    filter_out= './test_output/model_2_0/default_hyper/base/balanced/train_balanced_dataset.tsv.gz'
)
There are 51882 samples in the training set.
AUC analysis of the training dataset reveals AUC: 0.9300276293445563
AUC analysis of the testing dataset reveals AUC: 0.9394584540957758
File ./not_saving_directory/auc_analysis_bdhbd.csv found. Loading.
Top 10 worst performing genes: 
          gene  auc        f1  recall  fpr  precision  n_benign  n_malign  \
1694      LIPE  0.0  0.500000     1.0  0.0   0.333333         2         1   
1668    MRPS34  0.0  0.666667     1.0  0.0   0.500000         1         1   
1469     TCF20  0.0  0.923077     1.0  0.0   0.857143         1         6   
1658      SMPX  0.0  0.000000     0.0  1.0   0.000000         2         1   
823       PIGW  0.0  0.666667     1.0  0.0   0.500000         1         1   
354       SORD  0.0  0.500000     1.0  0.0   0.333333         2         1   
920     SLC6A9  0.0  0.500000     1.0  0.0   0.333333         2         1   
964       PURA  0.0  0.181818     1.0  0.0   0.100000         9         1   
1584     MEIS2  0.0  0.333333     1.0  0.0   0.200000         4         1   
1386  TRAPPC12  0.0  0.333333     1.0  0.0   0.200000         4         1   

      n_tot  n_train  n_test  
1694      3        3       0  
1668      2        2       0  
1469      7        5       2  
1658      3        2       1  
823       2        2       0  
354       3        2       1  
920       3        3       0  
964      10       10       0  
1584      5        4       1  
1386      5        5       0  
UMCG genepanels Mann-Whitney analysis: 
    two-sided      less   greater category_x                 compared_to  \
0    0.113764  0.944305  0.056882        all               Neurogenetics   
1    0.436748  0.218374  0.791904        all                 Amyloidosis   
2    0.000929  0.000464  0.999551        all              Cardiovascular   
3    0.147978  0.927812  0.073989        all    Primary Immunodeficiency   
4    0.520703  0.260352  0.743842        all                        Skin   
5    0.629612  0.314806  0.689033        all                    Epilepsy   
6    0.547875  0.737708  0.273938        all                  Angioedema   
7    0.535628  0.738066  0.267814        all                   Metabolic   
8    0.305295  0.855540  0.152647        all    Hyper-/ hypophosphatemia   
9    0.376834  0.820977  0.188417        all                Mitochondria   
10   0.086951  0.958392  0.043476        all                   Fertility   
11   0.079611  0.039805  0.961933        all    Congenital heart defects   
12   0.378187  0.813202  0.189093        all           Hereditary cancer   
13   0.723715  0.651306  0.361858        all  Early onset cardiomyopathy   
14   0.501809  0.760216  0.250904        all             Noonan syndrome   
15   0.832030  0.597720  0.416015        all  Primary ciliary dyskinesia   
16   0.915543  0.457772  0.556223        all     Developmental disorders   
17   0.128500  0.064250  0.940074        all           Leukemia-Lymphoma   

        mean       std                  category_y n_benign n_malign  n_tot  \
0   0.875978  0.033569               Neurogenetics     1920     4144   6064   
1   0.832012  0.000000                 Amyloidosis       28       22     50   
2   0.737692  0.172685              Cardiovascular     3016     3950   6966   
3   0.874161  0.018260    Primary Immunodeficiency     1083     1827   2910   
4   0.849056  0.027538                        Skin      657     1177   1834   
5   0.850584  0.014825                    Epilepsy     1027     2869   3896   
6   0.876923  0.000000                  Angioedema        3       12     15   
7   0.870003  0.015059                   Metabolic     3054    10528  13582   
8   0.890876  0.000000    Hyper-/ hypophosphatemia       32      342    374   
9   0.885757  0.000000                Mitochondria      469     1282   1751   
10  0.911060  0.030794                   Fertility       69      316    385   
11  0.819880  0.008610    Congenital heart defects     2688     4204   6892   
12  0.855386  0.077833           Hereditary cancer      809     5075   5884   
13  0.871418  0.000000  Early onset cardiomyopathy     2701     6144   8845   
14  0.878156  0.000000             Noonan syndrome      297      792   1089   
15  0.870290  0.000000  Primary ciliary dyskinesia      151      137    288   
16  0.855991  0.000000     Developmental disorders     5649    11396  17045   
17  0.716277  0.000000           Leukemia-Lymphoma       85      197    282   

   n_train  
0     6064  
1       50  
2     6966  
3     2910  
4     1834  
5     3896  
6       15  
7    13582  
8      374  
9     1751  
10     385  
11    6892  
12    5884  
13    8845  
14    1089  
15     288  
16   17045  
17     282  
The mean of the M-W analysis AUC: 0.8511943626891942

Base | Optimal hyper | balanced ds

(Back)

In [2]:
full_auc_analysis(curr_setup = 'Base | Optimal hyper | balanced ds', 
                  training_set_loc='./test_output/model_2_0/random_hyper/base/balanced/train_balanced_dataset.tsv.gz',
                  train_loc='./test_output/model_2_0/random_hyper/base/balanced/base_balanced_train.txt',
                  test_loc='./test_output/model_2_0/random_hyper/base/balanced/base_balanced_test.txt', 
                  auc_analysis_name='auc_analysis_randomhyper_base_balanced.csv', 
                  model='./test_output/model_2_0/random_hyper/base/balanced/xgb_optimal_model.pickle.dat',
                  filter_out='./test_output/model_2_0/random_hyper/base/balanced/train_balanced_dataset.tsv.gz'
                 )
Parameter learning_rate is set to 0.09666968094405538
Parameter n_estimators is set to 563
Parameter max_depth is set to 11
There are 51882 samples in the training set.
AUC analysis of the training dataset reveals AUC: 0.9303817411916423
AUC analysis of the testing dataset reveals AUC: 0.944057490800174
File ./not_saving_directory/auc_analysis_randomhyper_base_balanced.csv found. Loading.
Top 10 worst performing genes: 
          gene       auc        f1  recall  fpr  precision  n_benign  \
568       SORD  0.000000  0.500000     1.0  0.0   0.333333         2   
1641    MRPS34  0.000000  0.666667     1.0  0.0   0.500000         1   
766     SLC6A9  0.000000  0.500000     1.0  0.0   0.333333         2   
1654      SMPX  0.000000  0.500000     1.0  0.0   0.333333         2   
1432     TCF20  0.000000  0.923077     1.0  0.0   0.857143         1   
1682  ARHGAP29  0.000000  0.800000     1.0  0.0   0.666667         1   
1122      PIGW  0.000000  0.666667     1.0  0.0   0.500000         1   
1542     MEIS2  0.000000  0.333333     1.0  0.0   0.200000         4   
762       KRT1  0.104762  0.000000     0.0  1.0   0.000000       105   
651    TMEM231  0.111111  0.000000     0.0  1.0   0.000000        18   

      n_malign  n_tot  n_train  n_test  
568          1      3        2       1  
1641         1      2        2       0  
766          1      3        3       0  
1654         1      3        2       1  
1432         6      7        5       2  
1682         2      3        2       1  
1122         1      2        2       0  
1542         1      5        4       1  
762          1    106       98       8  
651          1     19       19       0  
UMCG genepanels Mann-Whitney analysis: 
    two-sided      less   greater category_x                 compared_to  \
0    0.109057  0.946619  0.054529        all               Neurogenetics   
1    1.000000  0.500000  0.514100        all                 Amyloidosis   
2    0.001179  0.000589  0.999430        all              Cardiovascular   
3    0.070294  0.965847  0.035147        all    Primary Immunodeficiency   
4    0.353557  0.176779  0.826567        all                        Skin   
5    0.332064  0.166032  0.836655        all                    Epilepsy   
6    0.229405  0.892002  0.114702        all                  Angioedema   
7    0.523869  0.743878  0.261934        all                   Metabolic   
8    0.501809  0.760216  0.250904        all    Hyper-/ hypophosphatemia   
9    0.273147  0.871013  0.136573        all                Mitochondria   
10   0.045454  0.978363  0.022727        all               Preconception   
11   0.132202  0.066101  0.936507        all    Congenital heart defects   
12   0.435939  0.784530  0.217970        all           Hereditary cancer   
13   0.671422  0.677080  0.335711        all  Early onset cardiomyopathy   
14   0.479569  0.771061  0.239784        all             Noonan syndrome   
15   0.571668  0.285834  0.726062        all  Primary ciliary dyskinesia   
16   0.971801  0.485900  0.528182        all     Developmental disorders   
17   0.119853  0.059926  0.944166        all           Leukemia-Lymphoma   

        mean       std                  category_y n_benign n_patho  n_tot  \
0   0.869192  0.041231               Neurogenetics     1920    4144   6064   
1   0.855497  0.000000                 Amyloidosis       28      22     50   
2   0.746953  0.183538              Cardiovascular     3016    3950   6966   
3   0.883508  0.029777    Primary Immunodeficiency     1083    1827   2910   
4   0.846104  0.021193                        Skin      657    1177   1834   
5   0.841857  0.021160                    Epilepsy     1027    2869   3896   
6   0.901923  0.000000                  Angioedema        3      12     15   
7   0.865926  0.020271                   Metabolic     3054   10528  13582   
8   0.875304  0.000000    Hyper-/ hypophosphatemia       32     342    374   
9   0.890998  0.000000                Mitochondria      469    1282   1751   
10  0.903567  0.017995               Preconception       69     316    385   
11  0.826526  0.013833    Congenital heart defects     2688    4204   6892   
12  0.854261  0.076323           Hereditary cancer      809    5075   5884   
13  0.868150  0.000000  Early onset cardiomyopathy     2701    6144   8845   
14  0.875937  0.000000             Noonan syndrome      297     792   1089   
15  0.840652  0.000000  Primary ciliary dyskinesia      151     137    288   
16  0.853992  0.000000     Developmental disorders     5649   11396  17045   
17  0.724644  0.000000           Leukemia-Lymphoma       85     197    282   

   n_train  
0     6064  
1       50  
2     6966  
3     2910  
4     1834  
5     3896  
6       15  
7    13582  
8      374  
9     1751  
10     385  
11    6892  
12    5884  
13    8845  
14    1089  
15     288  
16   17045  
17     282  
The mean of the M-W analysis AUC: 0.8513883576840361

Base | Optimal hyper | unbalanced ds

(Back)

In [4]:
full_auc_analysis(curr_setup = 'Base | Optimal hyper | unbalanced ds', 
                  training_set_loc='./datafiles/train.txt.gz',
                  train_loc='./test_output/model_2_0/random_hyper/base/unbalanced/base_unbalanced_train.txt',
                  test_loc='./test_output/model_2_0/random_hyper/base/unbalanced/base_unbalanced_test.txt', 
                  auc_analysis_name='auc_analysis_randomhyper_base_unbalanced.csv', 
                  model='./test_output/model_2_0/random_hyper/base/unbalanced/xgb_optimal_model.pickle.dat',
                  filter_out='./datafiles/train.txt.gz'
                 )

# Note, Angioedema is not shown in here, 
# since test.txt.gz only contains 3 pathogenic variants for this panel.
Parameter learning_rate is set to 0.3313738741131076
Parameter n_estimators is set to 300
Parameter max_depth is set to 13
There are 334601 samples in the training set.
AUC analysis of the testing dataset reveals AUC: 0.9001356915404205
File ./not_saving_directory/auc_analysis_randomhyper_base_unbalanced.csv found. Loading.
Top 10 worst performing genes: 
       gene  auc        f1  recall  fpr  precision  n_benign  n_patho  n_tot  \
160   POMT2  0.0  0.750000     1.0  0.0   0.600000         2        3      5   
41      DSP  0.0  0.956522     1.0  0.0   0.916667         1       11     12   
54      MPZ  0.0  0.941176     1.0  0.0   0.888889         1        8      9   
72   EXOC6B  0.0  0.500000     1.0  0.0   0.333333         2        1      3   
93     SDHD  0.0  0.923077     1.0  0.0   0.857143         1        6      7   
158   NFKB2  0.0  0.500000     1.0  0.0   0.333333         2        1      3   
176     ELN  0.0  0.571429     1.0  0.0   0.400000         3        2      5   
180    TLK2  0.0  0.666667     1.0  0.0   0.500000         1        1      2   
183    SDHA  0.0  0.888889     1.0  0.0   0.800000         1        4      5   
189   DUOX2  0.0  0.666667     1.0  0.0   0.500000         1        1      2   

     n_train  n_test  
160        0       5  
41         0      12  
54         0       9  
72         0       3  
93         0       7  
158        0       3  
176        0       5  
180        0       2  
183        0       5  
189        0       2  
/home/rjsietsma/PycharmProjects/dsls_master_thesis/side_scripts/utilities.py:496: UserWarning: Category Angioedema did not contain enough datapoints for Mann-Whitney analysis!
  warnings.warn(f"Category {category} did not contain enough datapoints for Mann-Whitney analysis!")
UMCG genepanels Mann-Whitney analysis: 
    two-sided      less   greater category_x                 compared_to  \
0    0.234522  0.884788  0.117261        all               Neurogenetics   
1    0.658568  0.329284  0.683405        all                 Amyloidosis   
2    0.338862  0.832985  0.169431        all              Cardiovascular   
3    0.402637  0.802309  0.201318        all    Primary Immunodeficiency   
4    0.590243  0.295121  0.709341        all                        Skin   
5    0.172066  0.915656  0.086033        all                    Epilepsy   
6         NaN       NaN       NaN        all                  Angioedema   
7    0.318993  0.159497  0.844824        all                   Metabolic   
8    1.000000  0.500000  0.514100        all    Hyper-/ hypophosphatemia   
9    0.179159  0.089580  0.916006        all                Mitochondria   
10   0.397790  0.806810  0.198895        all               Preconception   
11   0.397790  0.198895  0.806810        all    Congenital heart defects   
12   0.008948  0.004474  0.995636        all           Hereditary cancer   
13   0.804553  0.402277  0.611339        all  Early onset cardiomyopathy   
14   0.777322  0.388661  0.624819        all             Noonan syndrome   
15   0.859700  0.583988  0.429850        all  Primary ciliary dyskinesia   
16   0.915540  0.457770  0.556225        all     Developmental disorders   
17   0.111653  0.055826  0.948042        all           Leukemia-Lymphoma   

        mean       std                  category_y n_benign n_patho  n_tot  \
0   0.767613  0.099810               Neurogenetics    23659    6110  29769   
1   0.700000  0.000000                 Amyloidosis      367      23    390   
2   0.659012  0.195325              Cardiovascular    29144    5914  35058   
3   0.519867  0.028096    Primary Immunodeficiency    14028    2931  16959   
4   0.662514  0.107173                        Skin     8947    1714  10661   
5   0.784368  0.054484                    Epilepsy    14220    4052  18272   
6        NaN       NaN                  Angioedema       66      15     81   
7   0.615150  0.213211                   Metabolic    44555   15463  60018   
8   0.750000  0.000000    Hyper-/ hypophosphatemia      579     478   1057   
9   0.505328  0.000000                Mitochondria     6705    1762   8467   
10  0.833333  0.166667               Preconception      643     471   1114   
11  0.688658  0.016754    Congenital heart defects    23106    6384  29490   
12  0.638193  0.177201           Hereditary cancer    16869    9907  26776   
13  0.705968  0.000000  Early onset cardiomyopathy    30999   10073  41072   
14  0.702897  0.000000             Noonan syndrome     2524    1172   3696   
15  0.766667  0.000000  Primary ciliary dyskinesia     1534     265   1799   
16  0.729930  0.000000     Developmental disorders    59089   17262  76351   
17  0.333333  0.000000           Leukemia-Lymphoma     1227     237   1464   

   n_train  
0    29769  
1      390  
2    35058  
3    16959  
4    10661  
5    18272  
6       81  
7    60018  
8     1057  
9     8467  
10    1114  
11   29490  
12   26776  
13   41072  
14    3696  
15    1799  
16   76351  
17    1464  
The mean of the M-W analysis AUC: 0.6684017909411496

Preparing training data for Cardiovascular

Index

In [8]:
hevz = []
for key, value in genepanels['Hart- en vaatziekten'].items():
    for gene in value:
        if gene not in hevz:
            hevz.append(gene)
hevz
Out[8]:
['ABCC8',
 'ACTA2',
 'ACVRL1',
 'AQP1',
 'ATP13A3',
 'BMPR1B',
 'BMPR2',
 'CAV1',
 'EIF2AK4',
 'ENG',
 'GDF2',
 'GGCX',
 'KCNA5',
 'KCNK3',
 'KDR',
 'KLF2',
 'KLK1',
 'SMAD4',
 'SMAD9',
 'SOX17',
 'TBX4',
 'ABCC9',
 'AKAP9',
 'ANK2',
 'ASPH',
 'CACNA1C',
 'CACNA1D',
 'CACNA2D1',
 'CACNB2',
 'CALM1',
 'CALM2',
 'CALM3',
 'CASQ2',
 'CAV3',
 'GJA5',
 'GNB2',
 'GPD1L',
 'HCN4',
 'JPH2',
 'KCND3',
 'KCNE1',
 'KCNE2',
 'KCNE3',
 'KCNE1L',
 'KCNH2',
 'KCNJ2',
 'KCNJ5',
 'KCNJ8',
 'KCNQ1',
 'LAMP2',
 'LMNA',
 'MYL4',
 'NKX2-5',
 'NPPA',
 'PKP2',
 'PLN',
 'PPA2',
 'PRKAG2',
 'RANGRF',
 'RYR2',
 'SCN10A',
 'SCN1B',
 'SCN2B',
 'SCN3B',
 'SCN4B',
 'SCN5A',
 'SLMAP',
 'SNTA1',
 'TECRL',
 'TNNI3K',
 'TNNT2',
 'TRDN',
 'TRPM4',
 'CYP27A1',
 'SAR1B',
 'APOE',
 'ABCA1',
 'ABCG5',
 'ABCG8',
 'ANGPTL3',
 'APOA1',
 'APOA5',
 'APOB',
 'APOC2',
 'APOC3',
 'CETP',
 'CYP7A1',
 'GPD1',
 'GPIHBP1',
 'LCAT',
 'LDLR',
 'LDLRAP1',
 'LIPA',
 'LIPC',
 'LIPG',
 'LMF1',
 'LPL',
 'MTTP',
 'MYLIP',
 'PCSK9',
 'SCARB1',
 'SLCO1B1',
 'STAP1',
 'ACTC1',
 'ACTN2',
 'ALPK3',
 'ANKRD1',
 'ANO5',
 'BAG3',
 'CALR3',
 'CRYAB',
 'CSRP3',
 'CTNNA3',
 'DES',
 'DMD',
 'DSC2',
 'DSG2',
 'DSP',
 'DTNA',
 'EMD',
 'FHL1',
 'FKTN',
 'FLNC',
 'GLA',
 'ILK',
 'JUP',
 'LAMA4',
 'LDB3',
 'MIB1',
 'MYBPC3',
 'MYH6',
 'MYH7',
 'MYL2',
 'MYL3',
 'MYLK2',
 'MYOZ1',
 'MYOZ2',
 'MYPN',
 'NEXN',
 'PRDM16',
 'RBM20',
 'SGCD',
 'TAZ',
 'TBX20',
 'TCAP',
 'TMEM43',
 'TNNC1',
 'TNNI3',
 'TPM1',
 'TTN',
 'TTR',
 'TXNRD2',
 'VCL',
 'BRAF',
 'CBL',
 'HRAS',
 'KRAS',
 'LZTR1',
 'MAP2K1',
 'MAP2K2',
 'MAP3K7',
 'NF1',
 'NRAS',
 'PTPN11',
 'RAF1',
 'RIT1',
 'SHOC2',
 'SOS1',
 'SPRED1']
In [9]:
training_dataset = pd.read_csv('./datafiles/train.txt.gz', sep='\t', low_memory=False)
training_dataset
Out[9]:
#Chrom Allergy/Immunology/Infectious Alt AnnoType Audiologic/Otolaryngologic Biochemical CCDS CDSpos Cardiovascular ConsDetail ... revel sift source tOverlapMotifs targetScan to_be_deleted verPhCons verPhyloP inTest sample_weight
0 14 False G CodingTranscript False False CCDS9787.1 806.0 False frameshift ... NaN NaN vkgl NaN NaN False 1.000 5.843 False 1.0
1 20 False T CodingTranscript True False CCDS13112.1 1899.0 True frameshift,stop_gained ... NaN NaN vkgl NaN NaN False 1.000 4.670 False 1.0
2 20 False C CodingTranscript True False CCDS13112.1 2118.0 True frameshift ... NaN NaN vkgl NaN NaN False 1.000 5.043 False 1.0
3 20 False A CodingTranscript True False CCDS13112.1 1586.0 True frameshift ... NaN NaN vkgl NaN NaN False 1.000 6.221 False 1.0
4 20 False A Intergenic True False NaN NaN True downstream ... NaN NaN vkgl NaN NaN False 1.000 6.368 False 1.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
334596 17 False A CodingTranscript False False CCDS32642.1 1563.0 False stop_gained ... NaN NaN unknown NaN NaN False 1.000 6.031 False 0.8
334597 17 False T CodingTranscript False False CCDS32642.1 2029.0 False stop_gained ... NaN NaN unknown NaN NaN False 1.000 4.100 False 0.8
334598 10 False T CodingTranscript False False CCDS7431.1 1216.0 False stop_gained ... NaN NaN unknown NaN NaN False 1.000 5.852 False 0.8
334599 2 False T CodingTranscript False False CCDS2382.1 2998.0 False stop_gained ... NaN NaN unknown NaN NaN False 0.031 2.213 False 0.8
334600 5 False T CodingTranscript False False CCDS3952.1 1221.0 False stop_gained ... NaN NaN unknown NaN NaN False 1.000 0.528 False 0.8

334601 rows × 152 columns

In [10]:
for c in training_dataset.columns:
    print(c)
#Chrom
Allergy/Immunology/Infectious
Alt
AnnoType
Audiologic/Otolaryngologic
Biochemical
CCDS
CDSpos
Cardiovascular
ConsDetail
ConsScore
Consequence
CpG
Craniofacial
Dental
Dermatologic
Dist2Mutation
Domain
Dst2SplType
Dst2Splice
EncExp
EncH3K27Ac
EncH3K4Me1
EncH3K4Me3
EncNucleo
EncOCC
EncOCCombPVal
EncOCDNasePVal
EncOCDNaseSig
EncOCFairePVal
EncOCFaireSig
EncOCctcfPVal
EncOCctcfSig
EncOCmycPVal
EncOCmycSig
EncOCpolIIPVal
EncOCpolIISig
Endocrine
Exon
FeatureID
Freq10000bp
Freq1000bp
Freq100bp
GC
Gastrointestinal
GeneID
GeneName
General
Genitourinary
GerpN
GerpRS
GerpRSpval
GerpS
Grantham
Hematologic
Intron
Length
Musculoskeletal
Neurologic
Obstetric
Oncologic
Ophthalmologic
PHRED
PolyPhenCat
PolyPhenVal
Pos
Pulmonary
Rare10000bp
Rare1000bp
Rare100bp
RawScore
Ref
Renal
SIFTcat
SIFTval
Segway
Sngl10000bp
Sngl1000bp
Sngl100bp
Stars
TFBS
TFBSPeaks
TFBSPeaksMax
Type
allvalid
bStatistic
binarized_label
cDNApos
cHmmBivFlnk
cHmmEnh
cHmmEnhBiv
cHmmEnhG
cHmmHet
cHmmQuies
cHmmReprPC
cHmmReprPCWk
cHmmTssA
cHmmTssAFlnk
cHmmTssBiv
cHmmTx
cHmmTxFlnk
cHmmTxWk
cHmmZnfRpts
chr_pos_ref_alt
chr_pos_ref_alt_gene
clinpred
dbscSNV-ada_score
dbscSNV-rf_score
fathmm_score
inClinvar
inClinvar1Star
inClinvar2Star
inVKGLInsertion
inheritance
isAR
isInsertion
isPopulation
isVKGL_needsFurtherCorrection
label
mamPhCons
mamPhyloP
max_AF
minDistTSE
minDistTSS
mirSVR-Aln
mirSVR-E
mirSVR-Score
motifDist
motifECount
motifEHIPos
motifEName
motifEScoreChng
nAA
notinTest1
oAA
priPhCons
priPhyloP
protPos
provean
relCDSpos
relProtPos
relcDNApos
revel
sift
source
tOverlapMotifs
targetScan
to_be_deleted
verPhCons
verPhyloP
inTest
sample_weight
In [11]:
training_dataset[training_dataset['GeneName'].isin(hevz)]
Out[11]:
#Chrom Allergy/Immunology/Infectious Alt AnnoType Audiologic/Otolaryngologic Biochemical CCDS CDSpos Cardiovascular ConsDetail ... revel sift source tOverlapMotifs targetScan to_be_deleted verPhCons verPhyloP inTest sample_weight
18 15 False AA CodingTranscript False False CCDS42016.1 1737.0 True frameshift ... NaN NaN vkgl NaN NaN True 1.000 4.472 False 1.0
21 7 False GG CodingTranscript False False CCDS5928.1 431.0 True frameshift ... NaN NaN vkgl NaN NaN True 0.995 3.815 False 1.0
22 11 False G CodingTranscript False False CCDS53621.1 884.0 True frameshift ... NaN NaN vkgl NaN NaN False 1.000 4.533 False 1.0
23 11 False A CodingTranscript False False CCDS53621.1 2532.0 True frameshift ... NaN NaN vkgl NaN NaN False 1.000 5.262 False 1.0
24 11 False AGC CodingTranscript False False CCDS53621.1 3685.0 True frameshift ... NaN NaN vkgl NaN NaN True 1.000 3.135 False 1.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
334579 12 False A CodingTranscript False False CCDS8693.1 2080.0 True stop_gained ... NaN NaN unknown NaN NaN False 1.000 3.492 False 0.8
334580 12 False T CodingTranscript False False CCDS8693.1 1529.0 True stop_gained ... NaN NaN unknown NaN NaN False 1.000 5.757 False 0.8
334581 12 False C CodingTranscript False False CCDS8693.1 1119.0 True stop_gained ... NaN NaN unknown NaN NaN False 1.000 4.145 False 0.8
334582 12 False CATTTTCAT CodingTranscript False False CCDS8693.1 1034.0 True frameshift,stop_gained ... NaN NaN unknown NaN NaN False 1.000 0.938 False 0.8
334583 12 False T CodingTranscript False False CCDS8693.1 671.0 True stop_gained ... NaN NaN unknown NaN NaN False 1.000 5.889 False 0.8

35058 rows × 152 columns

In [12]:
filename = './datafiles/cardiovascular.txt.gz'
if not os.path.isfile(filename):
    training_dataset[training_dataset['GeneName'].isin(hevz)].to_csv('./datafiles/cardiovascular.txt.gz', compression='gzip', index=False, sep='\t')

Preparing data for Dyslipid.

Index

In [13]:
dyslipid_related = []
for key, value in genepanels['Hart- en vaatziekten'].items():
    if key.lower().startswith('dyslipid'):
        for gene in value:
            if gene not in dyslipid_related:
                dyslipid_related.append(gene)
dyslipid_related
Out[13]:
['CYP27A1',
 'SAR1B',
 'APOE',
 'ABCA1',
 'ABCG5',
 'ABCG8',
 'ANGPTL3',
 'APOA1',
 'APOA5',
 'APOB',
 'APOC2',
 'APOC3',
 'CETP',
 'CYP7A1',
 'GPD1',
 'GPIHBP1',
 'LCAT',
 'LDLR',
 'LDLRAP1',
 'LIPA',
 'LIPC',
 'LIPG',
 'LMF1',
 'LPL',
 'MTTP',
 'MYLIP',
 'PCSK9',
 'SCARB1',
 'SLCO1B1',
 'STAP1']
In [14]:
training_dataset[training_dataset['GeneName'].isin(dyslipid_related)]
Out[14]:
#Chrom Allergy/Immunology/Infectious Alt AnnoType Audiologic/Otolaryngologic Biochemical CCDS CDSpos Cardiovascular ConsDetail ... revel sift source tOverlapMotifs targetScan to_be_deleted verPhCons verPhyloP inTest sample_weight
142 16 False T CodingTranscript False False CCDS10772.1 848.0 True frameshift ... NaN NaN vkgl NaN NaN False 0.928 2.614 False 1.0
148 19 False CCGGCGAGGTGCAGGCCATGCT CodingTranscript False False CCDS12647.1 409.0 True protein_altering ... NaN NaN vkgl NaN NaN True 0.863 0.839 False 1.0
149 2 False C CodingTranscript False False CCDS1703.1 13028.0 True frameshift ... NaN NaN vkgl NaN NaN False 0.000 0.058 False 1.0
150 2 False G CodingTranscript False False CCDS1703.1 28.0 True frameshift ... NaN NaN vkgl NaN NaN False 0.021 -0.103 False 1.0
151 2 False C CodingTranscript False False CCDS1703.1 2534.0 True frameshift ... NaN NaN vkgl NaN NaN False 0.653 0.251 False 1.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
334284 12 False T CodingTranscript False False CCDS8685.1 1093.0 False stop_gained ... NaN NaN unknown NaN NaN False 0.999 3.072 False 0.8
334285 12 False A CodingTranscript False False CCDS8685.1 1475.0 False stop_gained ... NaN NaN unknown NaN NaN False 0.580 0.563 False 0.8
334286 12 False T CodingTranscript False False CCDS8685.1 1537.0 False stop_gained ... NaN NaN unknown NaN NaN False 0.003 0.643 False 0.8
334287 12 False G CodingTranscript False False CCDS8685.1 1553.0 False stop_gained ... NaN NaN unknown NaN NaN False 0.510 2.765 False 0.8
334288 12 False G CodingTranscript False False CCDS8685.1 1634.0 False stop_gained ... NaN NaN unknown NaN NaN False 0.000 -0.326 False 0.8

5029 rows × 152 columns

In [15]:
filename = './datafiles/dyslipid.txt.gz'
if not os.path.isfile(filename):
    training_dataset[training_dataset['GeneName'].isin(dyslipid_related)].to_csv(filename, compression='gzip', index=False, sep='\t')

Preparing data for Neurogenetics.

Index

In [16]:
neuro = []
for key, value in genepanels['Neurogenetica'].items():
    for gene in value:
        if gene not in neuro:
            neuro.append(gene)
neuro
Out[16]:
['APP',
 'APOE',
 'CHMP2B',
 'CSF1R',
 'FUS',
 'GRN',
 'MAPT',
 'NPC1',
 'PRNP',
 'PSEN1',
 'PSEN2',
 'SIGMAR1',
 'SORL1',
 'TARDBP',
 'TREM2',
 'UBE3A',
 'UBQLN2',
 'VCP',
 'ADH1C',
 'ATP13A2',
 'ATP1A3',
 'DNAJC13',
 'DNAJC6',
 'EIF4G1',
 'FBXO7',
 'GBA',
 'GIGYF2',
 'HTRA2',
 'LRRK2',
 'PARK2',
 'PARK7',
 'PINK1',
 'PLA2G6',
 'PRKRA',
 'SLC6A3',
 'SNCA',
 'SYNJ1',
 'TAF1',
 'UCHL1',
 'VPS35',
 'C19orf12',
 'CP',
 'FA2H',
 'FTL',
 'PANK2',
 'SLC30A10',
 'WDR45',
 'AP1S2',
 'FGF17',
 'FOXC1',
 'LAMC1',
 'NID1',
 'ZIC1',
 'ZIC4',
 'ADCK3',
 'ADCY5',
 'AMT',
 'ANO3',
 'ARHGEF9',
 'ASAH1',
 'ATM',
 'ATP7A',
 'ATP7B',
 'BRAT1',
 'CACNA1A',
 'CACNA1B',
 'CACNB4',
 'CAMTA1',
 'CARS2',
 'CASR',
 'CDKL5',
 'CERS1',
 'CHD2',
 'CLCN2',
 'CLN3',
 'CLN5',
 'CLN6',
 'CLN8',
 'CNTN2',
 'CSTB',
 'CYP27A1',
 'DNAJC5',
 'EFHC1',
 'EIF2B5',
 'EPM2A',
 'FOLR1',
 'FOXG1',
 'GABRA1',
 'GABRD',
 'GABRG2',
 'GALC',
 'GCSH',
 'GFAP',
 'GLDC',
 'GLRA1',
 'GLRB',
 'GNAO1',
 'GOSR2',
 'HEXA',
 'HEXB',
 'KCNC1',
 'KCNC3',
 'KCND3',
 'KCTD17',
 'KCTD7',
 'KMT2B',
 'MECP2',
 'MFSD8',
 'NEU1',
 'NHLRC1',
 'NKX2-1',
 'NPC2',
 'PCDH19',
 'PIGA',
 'POLG',
 'PPT1',
 'PRICKLE1',
 'PRKCG',
 'PSAP',
 'RELN',
 'RNASEH2A',
 'RNASEH2B',
 'RNASEH2C',
 'RPS6KA3',
 'SACS',
 'SAMHD1',
 'SCARB2',
 'SCN1A',
 'SCN2A',
 'SCN8A',
 'SCN9A',
 'SGCE',
 'SLC2A1',
 'SLC6A1',
 'SLC6A5',
 'STXBP1',
 'TBC1D24',
 'TH',
 'TPP1',
 'TREX1',
 'SERPINI1',
 'ABCD1',
 'AP4B1',
 'AP4E1',
 'AP4M1',
 'AP4S1',
 'AP5Z1',
 'ATL1',
 'B4GALNT1',
 'BSCL2',
 'C12orf65',
 'CAPN1',
 'CCT5',
 'CYP2U1',
 'CYP7B1',
 'DDHD1',
 'DDHD2',
 'ENTPD1',
 'ERLIN2',
 'FAM134B',
 'GBA2',
 'GBE1',
 'GJC2',
 'HSPD1',
 'KIAA0196',
 'KIF1A',
 'KIF1C',
 'KIF5A',
 'L1CAM',
 'MTPAP',
 'NIPA1',
 'PLP1',
 'PNPLA6',
 'RAB3GAP2',
 'REEP1',
 'RTN2',
 'SLC16A2',
 'SLC33A1',
 'SPAST',
 'SPG11',
 'SPG20',
 'SPG21',
 'SPG7',
 'TECPR2',
 'TFG',
 'VAMP1',
 'VPS37A',
 'WWOX',
 'ZFYVE26',
 'ZFYVE27',
 'ADAR',
 'ALDH5A1',
 'ARX',
 'BCS1L',
 'C10orf2',
 'CIZ1',
 'COX10',
 'COX15',
 'COX20',
 'DDC',
 'DLAT',
 'DLD',
 'GCDH',
 'GCH1',
 'GNAL',
 'IRF2BPL',
 'LRPPRC',
 'MTTP',
 'NDUFA10',
 'NDUFA12',
 'NDUFA2',
 'NDUFA9',
 'NDUFAF2',
 'NDUFAF5',
 'NDUFAF6',
 'NDUFS1',
 'NDUFS3',
 'NDUFS4',
 'NDUFS7',
 'NDUFS8',
 'NUP62',
 'PAH',
 'PCBD1',
 'PDHA1',
 'PDHB',
 'PDHX',
 'PNKD',
 'PRRT2',
 'PTS',
 'QDPR',
 'SCO2',
 'SERAC1',
 'SLC19A3',
 'SLC20A1',
 'SLC6A19',
 'SPR',
 'SUCLA2',
 'SUCLG1',
 'SURF1',
 'TACO1',
 'THAP1',
 'TIMM8A',
 'TOR1A',
 'TTPA',
 'TUBB4A',
 'VPS13A',
 'VPS13D',
 'ABHD12',
 'AFG3L2',
 'ANO10',
 'APTX',
 'ATCAY',
 'ATP2B3',
 'CACNA1G',
 'CCDC88C',
 'DNMT1',
 'EEF2',
 'ELOVL4',
 'ELOVL5',
 'FAT2',
 'FGF14',
 'FLVCR1',
 'FXN',
 'GDAP2',
 'GRID2',
 'GRM1',
 'IFRD1',
 'ITPR1',
 'KCNA1',
 'KIAA0226',
 'KIF26B',
 'MRE11A',
 'OPA1',
 'PDYN',
 'PEX10',
 'PEX7',
 'PHYH',
 'PIK3R5',
 'PLD3',
 'PMM2',
 'PNKP',
 'RNF170',
 'RNF216',
 'SETX',
 'SIL1',
 'SLC1A3',
 'SPTBN2',
 'STUB1',
 'SYNE1',
 'SYT14',
 'TDP1',
 'TGM6',
 'TMEM240',
 'TRPC4',
 'TTBK2',
 'VWA3B',
 'ZNF592',
 'AHI1',
 'ARL13B',
 'ARL3',
 'ARMC9',
 'B9D1',
 'B9D2',
 'C2CD3',
 'CC2D2A',
 'CELSR2',
 'CEP104',
 'CEP120',
 'CEP290',
 'CEP41',
 'CPLANE1',
 'CSPP1',
 'EXOC8',
 'GLI3',
 'INPP5E',
 'KIAA0556',
 'KIAA0586',
 'KIAA0753',
 'KIF7',
 'MKKS',
 'MKS1',
 'NPHP1',
 'NPHP3',
 'NPHP4',
 'OFD1',
 'PDE6D',
 'PIBF1',
 'POC1B',
 'RPGRIP1L',
 'SUFU',
 'TCTN1',
 'TCTN2',
 'TCTN3',
 'TMEM107',
 'TMEM138',
 'TMEM17',
 'TMEM216',
 'TMEM231',
 'TMEM237',
 'TMEM67',
 'TTC21B',
 'ZNF423',
 'AMPD2',
 'CASK',
 'CHMP1A',
 'CLP1',
 'EXOSC3',
 'EXOSC8',
 'EXOSC9',
 'PCLO',
 'RARS2',
 'SEPSECS',
 'SLC25A46',
 'TBC1D23',
 'TOE1',
 'TSEN15',
 'TSEN2',
 'TSEN34',
 'TSEN54',
 'VPS53',
 'VRK1',
 'AARS2',
 'ABCB7',
 'ACO2',
 'ACOX1',
 'ARHGEF2',
 'ARSA',
 'ATP8A2',
 'AUH',
 'CA8',
 'CACNA2D2',
 'CTBP1',
 'CWF19L1',
 'CYB5R3',
 'DKC1',
 'DNAJC3',
 'EBF3',
 'EIF2B2',
 'EIF2B3',
 'EIF2B4',
 'ERCC2',
 'ERCC3',
 'ERCC6',
 'ERCC8',
 'GALT',
 'GPSM2',
 'GTF2H5',
 'HSD17B4',
 'KCNJ10',
 'L2HGDH',
 'LAMA1',
 'MCOLN1',
 'MED17',
 'MED20',
 'MVK',
 'OPHN1',
 'PAX6',
 'PCNA',
 'PIGN',
 'PMPCA',
 'POLR3A',
 'POLR3B',
 'PTF1A',
 'QARS',
 'ROBO3',
 'SCYL1',
 'SLC17A5',
 'SLC52A2',
 'SLC9A1',
 'SLC9A6',
 'SNX14',
 'SYNGAP1',
 'TDP2',
 'THG1L',
 'UBA5',
 'VLDLR',
 'WDR73',
 'WDR81',
 'WFS1']
In [17]:
training_dataset[training_dataset['GeneName'].isin(neuro)]
Out[17]:
#Chrom Allergy/Immunology/Infectious Alt AnnoType Audiologic/Otolaryngologic Biochemical CCDS CDSpos Cardiovascular ConsDetail ... revel sift source tOverlapMotifs targetScan to_be_deleted verPhCons verPhyloP inTest sample_weight
49 6 False T CodingTranscript False False CCDS5004.1 1837.0 False frameshift ... NaN NaN vkgl NaN NaN False 1.000 5.778 False 1.0
114 3 False AT CodingTranscript False False CCDS2925.1 666.0 False frameshift ... NaN NaN vkgl NaN NaN True 1.000 5.747 False 1.0
128 2 False G CodingTranscript True True CCDS2419.1 398.0 False frameshift ... NaN NaN vkgl NaN NaN False 1.000 4.997 False 1.0
129 2 False C CodingTranscript True True CCDS2419.1 940.0 False frameshift ... NaN NaN vkgl NaN NaN False 1.000 3.298 False 1.0
143 15 False TT CodingTranscript False False CCDS42029.1 1207.0 False frameshift ... NaN NaN vkgl NaN NaN True 1.000 5.600 False 1.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
334455 11 False A CodingTranscript False False CCDS31452.1 163.0 False stop_gained ... NaN NaN unknown NaN NaN False 1.000 5.586 False 0.8
334511 10 False A CodingTranscript False False CCDS31314.1 1381.0 False stop_gained ... 0.173 NaN unknown NaN NaN False 1.000 3.144 False 0.8
334512 10 False A CodingTranscript False False CCDS31314.1 1165.0 False splice,stop_gained ... NaN NaN unknown NaN NaN False 1.000 6.319 False 0.8
334573 X False A CodingTranscript False True CCDS14728.1 230.0 False stop_gained ... NaN NaN unknown NaN NaN False 1.000 1.741 False 0.8
334574 X False T CodingTranscript False True CCDS14728.1 1699.0 False stop_gained ... NaN NaN unknown NaN NaN False 0.115 0.142 False 0.8

29769 rows × 152 columns

In [18]:
filename = './datafiles/neurogenetics.txt.gz'
if not os.path.isfile(filename):
    training_dataset[training_dataset['GeneName'].isin(neuro)].to_csv(filename, compression='gzip', index=False, sep='\t')

Preparing data for Hereditary cancer

Index

In [19]:
hc = []
for key, value in genepanels['Erfelijke Kanker'].items():
    for gene in value:
        if gene not in hc:
            hc.append(gene)
hc
Out[19]:
['ATM',
 'BRCA1',
 'BRCA2',
 'CHEK2',
 'PALB2',
 'TP53',
 'EPCAM',
 'MLH1',
 'MSH2',
 'MSH6',
 'MUTYH',
 'PMS2',
 'POLD1',
 'POLE',
 'APC',
 'AXIN2',
 'BMPR1A',
 'ENG',
 'MSH3',
 'NTHL1',
 'PTEN',
 'RNF43',
 'SMAD4',
 'STK11',
 'BRIP1',
 'RAD51C',
 'RAD51D',
 'FH',
 'MAX',
 'MDH2',
 'NF1',
 'RET',
 'SDHA',
 'SDHAF2',
 'SDHB',
 'SDHC',
 'SDHD',
 'TMEM127',
 'VHL',
 'AIP',
 'CDKN1A',
 'CDKN1B',
 'CDKN2B',
 'CDKN2C',
 'MEN1',
 'PRKAR1A',
 'ACD',
 'ACTRT1',
 'BAP1',
 'CDK4',
 'CDKN2A',
 'ERCC2',
 'MITF',
 'POLH',
 'POT1',
 'PTCH1',
 'PTCH2',
 'SUFU',
 'TERF2IP',
 'TERT',
 'DICER1',
 'LZTR1',
 'NF2',
 'SMARCA4',
 'SMARCB1',
 'SMARCE1',
 'TSC1',
 'TSC2',
 'ALK',
 'PHOX2B',
 'WT1',
 'CEBPA',
 'DDX41',
 'GATA2',
 'PAX5',
 'RUNX1',
 'CDH1',
 'CTNNA1',
 'FLCN',
 'MET',
 'PALLD',
 'HOXB13',
 'RB1',
 'EGFR',
 'CDC73',
 'KIT',
 'PDGFRA']
In [20]:
training_dataset[training_dataset['GeneName'].isin(hc)]
Out[20]:
#Chrom Allergy/Immunology/Infectious Alt AnnoType Audiologic/Otolaryngologic Biochemical CCDS CDSpos Cardiovascular ConsDetail ... revel sift source tOverlapMotifs targetScan to_be_deleted verPhCons verPhyloP inTest sample_weight
55 2 False A CodingTranscript False False CCDS1836.1 741.0 False frameshift ... NaN NaN vkgl NaN NaN False 0.839 -0.283 False 1.0
56 2 False TA CodingTranscript False False CCDS1836.1 1944.0 False frameshift ... NaN NaN vkgl NaN NaN True 0.001 0.466 False 1.0
57 2 False CC CodingTranscript False False CCDS1836.1 3259.0 False frameshift ... NaN NaN vkgl NaN NaN True 1.000 3.249 False 1.0
58 2 False C CodingTranscript False False CCDS1836.1 102.0 False frameshift ... NaN NaN vkgl NaN NaN False 0.000 -0.743 False 1.0
59 2 False AT CodingTranscript False False CCDS1836.1 395.0 False frameshift ... NaN NaN vkgl NaN NaN True 1.000 5.138 False 1.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
334504 5 False T CodingTranscript False False CCDS4107.1 3925.0 False stop_gained ... NaN NaN unknown NaN NaN False 1.000 3.528 False 0.8
334505 5 False GACAAAGA CodingTranscript False False CCDS4107.1 4160.0 False frameshift,stop_gained ... NaN NaN unknown NaN NaN False 1.000 5.909 False 0.8
334533 17 False A CodingTranscript False False CCDS45646.1 271.0 False stop_gained ... NaN NaN unknown NaN NaN False 0.003 0.639 False 0.8
334534 17 False A CodingTranscript False False CCDS45646.1 184.0 False stop_gained ... NaN NaN unknown NaN NaN False 0.003 0.266 False 0.8
334535 17 False C CodingTranscript False False CCDS45646.1 161.0 False stop_gained ... 0.017 NaN unknown NaN NaN False 0.004 0.318 False 0.8

26776 rows × 152 columns

In [21]:
filename = './datafiles/hereditarycancer.txt.gz'
if not os.path.isfile(filename):
    training_dataset[training_dataset['GeneName'].isin(hc)].to_csv(filename, compression='gzip', index=False, sep='\t')

AUC analysis of dyslipid model

Index

In [22]:
test_dyslipid = read_capice_output('./test_output/test_results_dyslipid_correct_threshold.txt')
test_dyslipid
Out[22]:
GeneName Consequence PHRED probabilities prediction combined_prediction chr pos ref alt
0 BRCA1 STOP_GAINED 34.000 9.999996e-01 Pathogenic Pathogenic 17 41243705 CTGAG GCCT
1 BRCA2 FRAME_SHIFT 26.900 9.999989e-01 Pathogenic Pathogenic 13 32912883 CTG TT
2 BRCA1 FRAME_SHIFT 22.000 9.999983e-01 Pathogenic Pathogenic 17 41245330 CTTTA TTT
3 BRCA1 FRAME_SHIFT 22.000 9.999982e-01 Pathogenic Pathogenic 17 41245404 GTA CT
4 BRCA2 FRAME_SHIFT 22.600 9.999980e-01 Pathogenic Pathogenic 13 32915061 TA T
... ... ... ... ... ... ... ... ... ... ...
10837 BLOC1S1-RDH5 DOWNSTREAM 8.009 1.597412e-07 Neutral Neutral 12 56115746 G C
10838 LOC100289580 UPSTREAM 0.168 7.783050e-08 Neutral Neutral 16 88801032 G T
10839 MYHAS DOWNSTREAM 0.035 6.001463e-08 Neutral Neutral 17 10442488 G A
10840 LOC100289580 UPSTREAM 1.856 4.832236e-08 Neutral Neutral 16 88798720 G A
10841 MIR548AZ NON_SYNONYMOUS 1.347 4.640942e-08 Neutral Neutral 14 64519932 A G

10842 rows × 10 columns

Cardiovascular | Default hyper | balanced

Back

In [10]:
full_auc_analysis(
    curr_setup = 'Cardiovascular, Default Hyper, balanced ds',
    train_loc = './test_output/model_2_0/default_hyper/cardiovascular/balanced/cardiovascular_balanced_train.txt',
    test_loc = './test_output/model_2_0/default_hyper/cardiovascular/balanced/cardiovascular_balanced_test.txt',
    auc_analysis_name= 'auc_analysis_cardio_balanced.csv',
    training_set_loc='./test_output/model_2_0/default_hyper/cardiovascular/balanced/train_balanced_dataset.tsv.gz',
    filter_out='./test_output/model_2_0/default_hyper/cardiovascular/unbalanced/splitted_train_dataset.tsv.gz'
)
There are 6642 samples in the training set.
AUC analysis of the training dataset reveals AUC: 0.7588318255473785
AUC analysis of the testing dataset reveals AUC: 0.6713240620308657
File ./not_saving_directory/auc_analysis_cardio_balanced.csv not found, creating.
I am stilling running, done 4%
I am stilling running, done 9%
I am stilling running, done 13%
I am stilling running, done 18%
I am stilling running, done 22%
I am stilling running, done 27%
I am stilling running, done 32%
I am stilling running, done 37%
I am stilling running, done 42%
I am stilling running, done 47%
I am stilling running, done 52%
I am stilling running, done 57%
I am stilling running, done 62%
I am stilling running, done 68%
I am stilling running, done 73%
I am stilling running, done 79%
I am stilling running, done 84%
I am stilling running, done 89%
I am stilling running, done 95%
Top 10 worst performing genes: 
          gene  auc        f1    recall       fpr  precision n_benign n_patho  \
2278     TEX15  0.0  0.000000  0.000000  1.000000   0.000000        3       1   
2242  KIAA0753  0.0  0.500000  0.666667  0.333333   0.400000        3       3   
1971      PLD1  0.0  0.666667  1.000000  0.000000   0.500000        1       1   
1969  HIST1H1E  0.0  0.285714  1.000000  0.000000   0.166667        5       1   
2319       PTH  0.0  0.000000  0.000000  1.000000   0.000000        3       1   
1165      PIGW  0.0  0.857143  1.000000  0.000000   0.750000        1       3   
1400     ERMAP  0.0  0.000000  0.000000  1.000000   0.000000       21       1   
2269     CDK10  0.0  0.800000  1.000000  0.000000   0.666667        1       2   
2270      FSHB  0.0  0.400000  1.000000  0.000000   0.250000        3       1   
1513      SORD  0.0  0.500000  1.000000  0.000000   0.333333        2       1   

     n_tot n_train n_test  
2278     4       4      0  
2242     6       6      0  
1971     2       2      0  
1969     6       5      1  
2319     4       4      0  
1165     4       4      0  
1400    22      18      4  
2269     3       3      0  
2270     4       4      0  
1513     3       2      1  
UMCG genepanels Mann-Whitney analysis: 
    two-sided      less   greater category_x                 compared_to  \
0    0.522899  0.261449  0.741946        all               Neurogenetics   
1    0.339858  0.838863  0.169929        all                 Amyloidosis   
2    0.613825  0.696455  0.306912        all              Cardiovascular   
3    0.103449  0.949636  0.051724        all    Primary Immunodeficiency   
4    0.225068  0.889926  0.112534        all                        Skin   
5    0.332064  0.166032  0.836655        all                    Epilepsy   
6    0.103930  0.951688  0.051965        all                  Angioedema   
7    0.878702  0.439351  0.567719        all                   Metabolic   
8    0.179177  0.915998  0.089589        all    Hyper-/ hypophosphatemia   
9    0.229405  0.892002  0.114702        all                Mitochondria   
10   0.364190  0.182095  0.823305        all               Preconception   
11   0.364190  0.182095  0.823305        all    Congenital heart defects   
12   0.084677  0.042339  0.958424        all           Hereditary cancer   
13   0.595941  0.714166  0.297970        all  Early onset cardiomyopathy   
14   0.147242  0.931187  0.073621        all             Noonan syndrome   
15   0.571668  0.285834  0.726062        all  Primary ciliary dyskinesia   
16   0.859705  0.429853  0.583985        all     Developmental disorders   
17   0.943637  0.471818  0.542228        all           Leukemia-Lymphoma   

        mean       std                  category_y n_benign n_patho n_tot  \
0   0.780080  0.037503               Neurogenetics        7      21    28   
1   0.825701  0.000000                 Amyloidosis       13      13    26   
2   0.764919  0.156468              Cardiovascular     3321    3321  6642   
3   0.826119  0.053637    Primary Immunodeficiency       12      57    69   
4   0.805392  0.022231                        Skin       83     187   270   
5   0.757671  0.086993                    Epilepsy       15       5    20   
6   0.948889  0.000000                  Angioedema        0       0     0   
7   0.780576  0.035064                   Metabolic      546    1590  2136   
8   0.865229  0.000000    Hyper-/ hypophosphatemia        0       0     0   
9   0.838694  0.000000                Mitochondria        6      40    46   
10  0.671987  0.165941               Preconception        0       0     0   
11  0.771809  0.003371    Congenital heart defects      758    1104  1862   
12  0.771949  0.051452           Hereditary cancer      182     447   629   
13  0.804071  0.000000  Early onset cardiomyopathy     1958    2220  4178   
14  0.872504  0.000000             Noonan syndrome      326     649   975   
15  0.771220  0.000000  Primary ciliary dyskinesia        0       0     0   
16  0.783336  0.000000     Developmental disorders      572     995  1567   
17  0.786761  0.000000           Leukemia-Lymphoma        3      57    60   

   n_train  
0       28  
1       26  
2     6642  
3       69  
4      270  
5       20  
6        0  
7     2136  
8        0  
9       46  
10       0  
11    1862  
12     629  
13    4178  
14     975  
15       0  
16    1567  
17      60  
The mean of the M-W analysis AUC: 0.8014948049821656

Cardiovascular | Default hyper | unbalanced

Back

In [3]:
full_auc_analysis(
    curr_setup = 'Cardiovascular, Default Hyper, unbalanced ds',
    train_loc = './test_output/model_2_0/default_hyper/cardiovascular/unbalanced/cardiovascular_unbalanced_train.txt',
    test_loc = './test_output/model_2_0/default_hyper/cardiovascular/unbalanced/cardiovascular_unbalanced_test.txt',
    auc_analysis_name= 'auc_analysis_cardio_unbalanced.csv',
    training_set_loc='./test_output/model_2_0/default_hyper/cardiovascular/unbalanced/splitted_train_dataset.tsv.gz',
    filter_out='./test_output/model_2_0/default_hyper/cardiovascular/unbalanced/splitted_train_dataset.tsv.gz'
)
There are 28046 samples in the training set.
AUC analysis of the training dataset reveals AUC: 0.8979080372224252
AUC analysis of the testing dataset reveals AUC: 0.6495063282735525
File ./not_saving_directory/auc_analysis_cardio_unbalanced.csv not found, creating.
I am stilling running, done 4%
I am stilling running, done 9%
I am stilling running, done 14%
I am stilling running, done 18%
I am stilling running, done 23%
I am stilling running, done 28%
I am stilling running, done 33%
I am stilling running, done 39%
I am stilling running, done 44%
I am stilling running, done 50%
I am stilling running, done 56%
I am stilling running, done 62%
I am stilling running, done 68%
I am stilling running, done 74%
I am stilling running, done 80%
I am stilling running, done 86%
I am stilling running, done 92%
I am stilling running, done 98%
Top 10 worst performing genes: 
          gene       auc        f1  recall  fpr  precision n_benign n_malign  \
2252     CDK10  0.000000  0.000000     0.0  1.0   0.000000        1        2   
1394    UBQLN2  0.000000  0.000000     0.0  1.0   0.000000       49        1   
751      ASCC1  0.000000  0.000000     0.0  1.0   0.000000       63        1   
994      PTHLH  0.000000  0.285714     1.0  0.0   0.166667        5        1   
2151      SAA1  0.016393  0.000000     0.0  1.0   0.000000       61        1   
1124       TNC  0.056537  0.000000     0.0  1.0   0.000000      283        1   
871   KIAA0196  0.091503  0.000000     0.0  1.0   0.000000      306        1   
700       MSH3  0.139241  0.000000     0.0  1.0   0.000000       79        1   
1796     FOXC2  0.142857  0.000000     0.0  1.0   0.000000       21        1   
2029     ERMAP  0.142857  0.000000     0.0  1.0   0.000000       21        1   

     n_tot n_train n_test  
2252     3       3      0  
1394    50      50      0  
751     64      56      8  
994      6       4      2  
2151    62      61      1  
1124   284     278      6  
871    307     299      8  
700     80      62     18  
1796    22      18      4  
2029    22      18      4  
/home/rjsietsma/PycharmProjects/dsls_master_thesis/side_scripts/utilities.py:429: RuntimeWarning: Mean of empty slice.
  x_mean = x.mean()
/home/rjsietsma/PycharmProjects/dsls_master_thesis/venv/lib/python3.8/site-packages/numpy/core/_methods.py:170: RuntimeWarning: invalid value encountered in double_scalars
  ret = ret.dtype.type(ret / rcount)
/home/rjsietsma/PycharmProjects/dsls_master_thesis/venv/lib/python3.8/site-packages/numpy/core/_methods.py:233: RuntimeWarning: Degrees of freedom <= 0 for slice
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
/home/rjsietsma/PycharmProjects/dsls_master_thesis/venv/lib/python3.8/site-packages/numpy/core/_methods.py:194: RuntimeWarning: invalid value encountered in true_divide
  arrmean = um.true_divide(
/home/rjsietsma/PycharmProjects/dsls_master_thesis/venv/lib/python3.8/site-packages/numpy/core/_methods.py:226: RuntimeWarning: invalid value encountered in double_scalars
  ret = ret.dtype.type(ret / rcount)
UMCG genepanels Mann-Whitney analysis: 
    two-sided      less   greater category_x                 compared_to  \
0    0.280688  0.140344  0.861978        all               Neurogenetics   
1    0.137624  0.068812  0.935750        all                 Amyloidosis   
2    0.379138  0.189569  0.813025        all              Cardiovascular   
3    0.130638  0.936316  0.065319        all    Primary Immunodeficiency   
4    0.045001  0.022501  0.978184        all                        Skin   
5    0.210662  0.105331  0.896631        all                    Epilepsy   
6    0.832029  0.597720  0.416015        all                  Angioedema   
7    0.892880  0.446440  0.560649        all                   Metabolic   
8    0.288918  0.863427  0.144459        all    Hyper-/ hypophosphatemia   
9    0.358044  0.830072  0.179022        all                Mitochondria   
10   0.695179  0.659999  0.347589        all                   Fertility   
11   0.107705  0.053853  0.948367        all    Congenital heart defects   
12   0.001161  0.999437  0.000581        all           Hereditary cancer   
13   0.620671  0.702030  0.310336        all  Early onset cardiomyopathy   
14   0.479567  0.771062  0.239784        all             Noonan syndrome   
15   0.723714  0.651306  0.361857        all  Primary ciliary dyskinesia   
16   0.501807  0.250904  0.760216        all     Developmental disorders   
17   0.671421  0.335710  0.677080        all           Leukemia-Lymphoma   

        mean       std                  category_y n_benign n_malign  n_tot  \
0   0.906575  0.033911               Neurogenetics      196       24    220   
1   0.833157  0.000000                 Amyloidosis       88       14    102   
2   0.895545  0.070944              Cardiovascular    23299     4747  28046   
3   0.936085  0.033884    Primary Immunodeficiency      119       62    181   
4   0.895098  0.022762                        Skin      897      233   1130   
5   0.902730  0.040490                    Epilepsy       73        5     78   
6   0.925397  0.000000                  Angioedema        0        0      0   
7   0.909729  0.035563                   Metabolic     4900     2305   7205   
8   0.948570  0.000000    Hyper-/ hypophosphatemia        0        0      0   
9   0.944768  0.000000                Mitochondria      135       50    185   
10  0.882407  0.108342                   Fertility        0        0      0   
11  0.889803  0.013039    Congenital heart defects     4739     1525   6264   
12  0.936681  0.031690           Hereditary cancer     1096      760   1856   
13  0.935017  0.000000  Early onset cardiomyopathy    14628     3314  17942   
14  0.938973  0.000000             Noonan syndrome     2005      934   2939   
15  0.928358  0.000000  Primary ciliary dyskinesia        0        0      0   
16  0.900367  0.000000     Developmental disorders     3920     1485   5405   
17  0.909191  0.000000           Leukemia-Lymphoma       58       61    119   

   n_train  
0      220  
1      102  
2    28046  
3      181  
4     1130  
5       78  
6        0  
7     7205  
8        0  
9      185  
10       0  
11    6264  
12    1856  
13   17942  
14    2939  
15       0  
16    5405  
17     119  
The mean of the M-W analysis AUC: 0.9121361395964642

Dyslipid | Default hyper | balanced

Back

In [21]:
full_auc_analysis(
    curr_setup = 'Dyslipid, Default Hyper, balanced ds',
    train_loc = './test_output/model_2_0/default_hyper/dyslipid/balanced/dyslipid_balanced_train.txt',
    test_loc = './test_output/model_2_0/default_hyper/dyslipid/balanced/dyslipid_balanced_test.txt',
    auc_analysis_name= 'auc_analysis_dyslipid_balanced.csv',
    training_set_loc='./test_output/model_2_0/default_hyper/dyslipid/balanced/train_balanced_dataset.tsv.gz',
    filter_out='./datafiles/dyslipid.txt.gz'
)
There are 1530 samples in the training set.
AUC analysis of the training dataset reveals AUC: 0.6635443999981026
AUC analysis of the testing dataset reveals AUC: 0.5742350055930735
File ./not_saving_directory/auc_analysis_dyslipid_balanced.csv not found, creating.
I am stilling running, done 4%
I am stilling running, done 9%
I am stilling running, done 14%
I am stilling running, done 19%
I am stilling running, done 23%
I am stilling running, done 28%
I am stilling running, done 33%
I am stilling running, done 38%
I am stilling running, done 42%
I am stilling running, done 47%
I am stilling running, done 52%
I am stilling running, done 57%
I am stilling running, done 62%
I am stilling running, done 67%
I am stilling running, done 72%
I am stilling running, done 77%
I am stilling running, done 83%
I am stilling running, done 88%
I am stilling running, done 94%
I am stilling running, done 99%
Top 10 worst performing genes: 
        gene  auc        f1  recall  fpr  precision n_benign n_malign n_tot  \
484     SNCA  0.0  0.060606     1.0  0.0   0.031250       31        1    32   
2202    PBX1  0.0  0.666667     1.0  0.0   0.500000        1        1     2   
1012  RNF170  0.0  0.071429     1.0  0.0   0.037037       26        1    27   
1919  TWIST2  0.0  0.666667     1.0  0.0   0.500000        1        1     2   
1998   TENM4  0.0  0.066667     1.0  0.0   0.034483       28        1    29   
980    CDK10  0.0  0.800000     1.0  0.0   0.666667        1        2     3   
552     GLUL  0.0  0.100000     1.0  0.0   0.052632       18        1    19   
1705    SSR4  0.0  0.285714     1.0  0.0   0.166667        5        1     6   
1615    FAR1  0.0  0.666667     1.0  0.0   0.500000        1        1     2   
934     ARV1  0.0  0.500000     1.0  0.0   0.333333        2        1     3   

     n_train n_test  
484       32      0  
2202       2      0  
1012      27      0  
1919       2      0  
1998      28      1  
980        3      0  
552       19      0  
1705       6      0  
1615       2      0  
934        3      0  
UMCG genepanels Mann-Whitney analysis: 
    two-sided      less   greater category_x                 compared_to  \
0    0.038102  0.019051  0.981430        all               Neurogenetics   
1    0.273147  0.871013  0.136573        all                 Amyloidosis   
2    0.001140  0.999449  0.000570        all              Cardiovascular   
3    0.205850  0.102925  0.899382        all    Primary Immunodeficiency   
4    0.581334  0.290667  0.713763        all                        Skin   
5    0.377093  0.188547  0.814366        all                    Epilepsy   
6    0.645841  0.689664  0.322920        all                  Angioedema   
7    0.489404  0.760904  0.244702        all                   Metabolic   
8    0.288920  0.144460  0.863427        all    Hyper-/ hypophosphatemia   
9    0.832030  0.416015  0.597720        all                Mitochondria   
10   0.132202  0.936507  0.066101        all                   Fertility   
11   0.741425  0.637052  0.370712        all    Congenital heart defects   
12   0.874838  0.565934  0.437419        all           Hereditary cancer   
13   0.915543  0.457772  0.556223        all  Early onset cardiomyopathy   
14   0.273147  0.136573  0.871013        all             Noonan syndrome   
15   0.971801  0.485900  0.528182        all  Primary ciliary dyskinesia   
16   1.000000  0.514100  0.500000        all     Developmental disorders   
17   0.190893  0.095446  0.910411        all           Leukemia-Lymphoma   

        mean       std                  category_y n_benign n_malign n_tot  \
0   0.618806  0.097035               Neurogenetics       13       14    27   
1   0.722522  0.000000                 Amyloidosis       17        0    17   
2   0.786528  0.133717              Cardiovascular      765      765  1530   
3   0.636262  0.046869    Primary Immunodeficiency        0        0     0   
4   0.658679  0.021834                        Skin        0        0     0   
5   0.636042  0.077654                    Epilepsy        0        0     0   
6   0.672540  0.000000                  Angioedema        0        0     0   
7   0.685471  0.036536                   Metabolic      549      752  1301   
8   0.626876  0.000000    Hyper-/ hypophosphatemia        0        0     0   
9   0.657848  0.000000                Mitochondria        0        0     0   
10  0.753753  0.083603                   Fertility        0        0     0   
11  0.674688  0.025307    Congenital heart defects        0        0     0   
12  0.671705  0.049922           Hereditary cancer        0        0     0   
13  0.659703  0.000000  Early onset cardiomyopathy        0        0     0   
14  0.626599  0.000000             Noonan syndrome        0        0     0   
15  0.660847  0.000000  Primary ciliary dyskinesia        0        0     0   
16  0.662528  0.000000     Developmental disorders        3       12    15   
17  0.610505  0.000000           Leukemia-Lymphoma        0        0     0   

   n_train  
0       27  
1       17  
2     1530  
3        0  
4        0  
5        0  
6        0  
7     1301  
8        0  
9        0  
10       0  
11       0  
12       0  
13       0  
14       0  
15       0  
16      15  
17       0  
The mean of the M-W analysis AUC: 0.6678834222011004

Dyslipid | Default hyper | unbalanced

Back

In [22]:
full_auc_analysis(
    curr_setup = 'Dyslipid, Default Hyper, unbalanced ds',
    train_loc = './test_output/model_2_0/default_hyper/dyslipid/unbalanced/dyslipid_unbalanced_train.txt',
    test_loc = './test_output/model_2_0/default_hyper/dyslipid/unbalanced/dyslipid_unbalanced_test.txt',
    auc_analysis_name= 'auc_analysis_dyslipid_unbalanced.csv',
    training_set_loc='./datafiles/dyslipid.txt.gz',
    filter_out='./datafiles/dyslipid.txt.gz'
)
There are 5029 samples in the training set.
AUC analysis of the training dataset reveals AUC: 0.8353792714855658
AUC analysis of the testing dataset reveals AUC: 0.5906942914443721
File ./not_saving_directory/auc_analysis_dyslipid_unbalanced.csv not found, creating.
I am stilling running, done 4%
I am stilling running, done 8%
I am stilling running, done 13%
I am stilling running, done 17%
I am stilling running, done 22%
I am stilling running, done 26%
I am stilling running, done 31%
I am stilling running, done 35%
I am stilling running, done 40%
I am stilling running, done 45%
I am stilling running, done 50%
I am stilling running, done 55%
I am stilling running, done 60%
I am stilling running, done 66%
I am stilling running, done 71%
I am stilling running, done 76%
I am stilling running, done 81%
I am stilling running, done 86%
I am stilling running, done 92%
I am stilling running, done 97%
Top 10 worst performing genes: 
          gene       auc        f1    recall       fpr  precision n_benign  \
2353        F2  0.000000  0.000000  0.000000  1.000000   0.000000        2   
1853      INTU  0.000000  0.666667  0.666667  0.333333   0.666667        1   
2151      FAR1  0.000000  0.666667  1.000000  0.000000   0.500000        1   
1455     ERMAP  0.000000  0.000000  0.000000  1.000000   0.000000       21   
233   KIAA0196  0.000000  0.000000  0.000000  1.000000   0.000000      306   
2351  TRAF3IP1  0.000000  0.000000  0.000000  1.000000   0.000000        3   
2293      SAA1  0.016393  0.000000  0.000000  1.000000   0.000000       61   
1105    UBQLN2  0.030612  0.000000  0.000000  1.000000   0.000000       49   
491      ASCC1  0.047619  0.000000  0.000000  1.000000   0.000000       63   
2195     GRIA4  0.071429  0.000000  0.000000  1.000000   0.000000       14   

     n_malign n_tot n_train n_test  
2353        1     3       3      0  
1853        3     4       4      0  
2151        1     2       2      0  
1455        1    22      18      4  
233         1   307     299      8  
2351        1     4       4      0  
2293        1    62      61      1  
1105        1    50      50      0  
491         1    64      56      8  
2195        1    15      15      0  
UMCG genepanels Mann-Whitney analysis: 
    two-sided      less   greater category_x                 compared_to  \
0    0.667612  0.333806  0.669996        all               Neurogenetics   
1    0.943635  0.542230  0.471818        all                 Amyloidosis   
2    0.003755  0.998179  0.001877        all              Cardiovascular   
3    0.922476  0.461238  0.543911        all    Primary Immunodeficiency   
4    0.005171  0.002585  0.997517        all                        Skin   
5    0.015890  0.007945  0.992288        all                    Epilepsy   
6    0.620661  0.702036  0.310331        all                  Angioedema   
7    0.686217  0.343108  0.663469        all                   Metabolic   
8    0.358030  0.830078  0.179015        all    Hyper-/ hypophosphatemia   
9    0.479555  0.771068  0.239777        all                Mitochondria   
10   0.950666  0.475333  0.532873        all                   Fertility   
11   0.201016  0.100508  0.903077        all    Congenital heart defects   
12   0.019425  0.990507  0.009712        all           Hereditary cancer   
13   0.777324  0.624818  0.388662        all  Early onset cardiomyopathy   
14   0.119841  0.059920  0.944172        all             Noonan syndrome   
15   0.887551  0.570149  0.443775        all  Primary ciliary dyskinesia   
16   0.571655  0.285828  0.726069        all     Developmental disorders   
17   0.595929  0.297964  0.714172        all           Leukemia-Lymphoma   

        mean       std                  category_y n_benign n_malign n_tot  \
0   0.860758  0.050208               Neurogenetics      194       26   220   
1   0.871059  0.000000                 Amyloidosis       58        3    61   
2   0.932290  0.074335              Cardiovascular     3913     1116  5029   
3   0.873093  0.050234    Primary Immunodeficiency        0        0     0   
4   0.832898  0.022939                        Skin        0        0     0   
5   0.830522  0.074652                    Epilepsy        0        0     0   
6   0.887302  0.000000                  Angioedema        0        0     0   
7   0.858713  0.032595                   Metabolic     2670     1094  3764   
8   0.914752  0.000000    Hyper-/ hypophosphatemia        0        0     0   
9   0.902504  0.000000                Mitochondria        0        0     0   
10  0.825962  0.114641                   Fertility        0        0     0   
11  0.843331  0.012709    Congenital heart defects        0        0     0   
12  0.892580  0.049230           Hereditary cancer        0        0     0   
13  0.878267  0.000000  Early onset cardiomyopathy        0        0     0   
14  0.781220  0.000000             Noonan syndrome        0        0     0   
15  0.873619  0.000000  Primary ciliary dyskinesia        0        0     0   
16  0.851944  0.000000     Developmental disorders       40       24    64   
17  0.852123  0.000000           Leukemia-Lymphoma        0        0     0   

   n_train  
0      220  
1       61  
2     5029  
3        0  
4        0  
5        0  
6        0  
7     3764  
8        0  
9        0  
10       0  
11       0  
12       0  
13       0  
14       0  
15       0  
16      64  
17       0  
The mean of the M-W analysis AUC: 0.864607625020695

Hereditary cancer | Default hyper | balanced

Back

In [4]:
full_auc_analysis(
    curr_setup = 'Hereditary cancer, Default Hyper, balanced ds',
    train_loc = './test_output/model_2_0/default_hyper/ek/balanced/ek_balanced_train.txt',
    test_loc = './test_output/model_2_0/default_hyper/ek/balanced/ek_balanced_test.txt',
    auc_analysis_name= 'auc_analysis_hc_balanced.csv',
    training_set_loc='./test_output/model_2_0/default_hyper/ek/balanced/train_balanced_dataset.tsv.gz',
    filter_out='./test_output/model_2_0/default_hyper/ek/unbalanced/splitted_train_dataset.tsv.gz'
)
There are 2766 samples in the training set.
AUC analysis of the training dataset reveals AUC: 0.7882081912133676
AUC analysis of the testing dataset reveals AUC: 0.7246651191243165
File ./not_saving_directory/auc_analysis_hc_balanced.csv not found, creating.
I am stilling running, done 4%
I am stilling running, done 9%
I am stilling running, done 14%
I am stilling running, done 18%
I am stilling running, done 23%
I am stilling running, done 28%
I am stilling running, done 33%
I am stilling running, done 38%
I am stilling running, done 43%
I am stilling running, done 49%
I am stilling running, done 54%
I am stilling running, done 60%
I am stilling running, done 66%
I am stilling running, done 71%
I am stilling running, done 77%
I am stilling running, done 83%
I am stilling running, done 89%
I am stilling running, done 95%
Top 10 worst performing genes: 
          gene  auc        f1  recall  fpr  precision n_benign n_malign n_tot  \
2342       AVP  0.0  0.000000     0.0  1.0   0.000000        1        1     2   
778      ERMAP  0.0  0.000000     0.0  1.0   0.000000       21        1    22   
766       ANK3  0.0  0.000000     0.0  1.0   0.000000      110        1   111   
701       PIGW  0.0  0.857143     1.0  0.0   0.750000        1        3     4   
2315      EFHB  0.0  0.666667     1.0  0.0   0.500000        1        1     2   
569     SLC6A9  0.0  0.666667     1.0  0.0   0.500000        2        2     4   
1708      FAR1  0.0  0.666667     1.0  0.0   0.500000        1        1     2   
2116        F2  0.0  0.500000     1.0  0.0   0.333333        2        1     3   
1824  HIST1H1E  0.0  0.285714     1.0  0.0   0.166667        5        1     6   
2043    MRPS34  0.0  0.800000     1.0  0.0   0.666667        1        2     3   

     n_train n_test  
2342       2      0  
778       18      4  
766      106      5  
701        4      0  
2315       2      0  
569        4      0  
1708       2      0  
2116       3      0  
1824       5      1  
2043       3      0  
/home/rjsietsma/PycharmProjects/dsls_master_thesis/side_scripts/utilities.py:429: RuntimeWarning: Mean of empty slice.
  x_mean = x.mean()
/home/rjsietsma/PycharmProjects/dsls_master_thesis/venv/lib/python3.8/site-packages/numpy/core/_methods.py:170: RuntimeWarning: invalid value encountered in double_scalars
  ret = ret.dtype.type(ret / rcount)
/home/rjsietsma/PycharmProjects/dsls_master_thesis/venv/lib/python3.8/site-packages/numpy/core/_methods.py:233: RuntimeWarning: Degrees of freedom <= 0 for slice
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
/home/rjsietsma/PycharmProjects/dsls_master_thesis/venv/lib/python3.8/site-packages/numpy/core/_methods.py:194: RuntimeWarning: invalid value encountered in true_divide
  arrmean = um.true_divide(
/home/rjsietsma/PycharmProjects/dsls_master_thesis/venv/lib/python3.8/site-packages/numpy/core/_methods.py:226: RuntimeWarning: invalid value encountered in double_scalars
  ret = ret.dtype.type(ret / rcount)
UMCG genepanels Mann-Whitney analysis: 
    two-sided      less   greater category_x                 compared_to  \
0    0.883425  0.562418  0.441712        all               Neurogenetics   
1    0.943637  0.471818  0.542228        all                 Amyloidosis   
2    0.011012  0.005506  0.994644        all              Cardiovascular   
3    0.503999  0.752123  0.251999        all    Primary Immunodeficiency   
4    0.563678  0.722526  0.281839        all                        Skin   
5    0.798967  0.399484  0.604696        all                    Epilepsy   
6    0.119853  0.944166  0.059926        all                  Angioedema   
7    0.739766  0.636875  0.369883        all                   Metabolic   
8    0.396218  0.811583  0.198109        all    Hyper-/ hypophosphatemia   
9    0.305295  0.855540  0.152647        all                Mitochondria   
10   0.409418  0.204709  0.801095        all                   Fertility   
11   0.167052  0.083526  0.919596        all    Congenital heart defects   
12   0.074461  0.963456  0.037231        all           Hereditary cancer   
13   0.671422  0.677080  0.335711        all  Early onset cardiomyopathy   
14   0.339858  0.169929  0.838863        all             Noonan syndrome   
15   0.436748  0.791904  0.218374        all  Primary ciliary dyskinesia   
16   0.804561  0.402280  0.611334        all     Developmental disorders   
17   0.168005  0.084002  0.921318        all           Leukemia-Lymphoma   

        mean       std                  category_y n_benign n_malign n_tot  \
0   0.826700  0.048631               Neurogenetics       41       62   103   
1   0.833150  0.000000                 Amyloidosis        0        0     0   
2   0.797463  0.077504              Cardiovascular      244      153   397   
3   0.839485  0.045524    Primary Immunodeficiency       60       75   135   
4   0.842605  0.033515                        Skin       66       24    90   
5   0.826638  0.045358                    Epilepsy        1        0     1   
6   0.939048  0.000000                  Angioedema        0        0     0   
7   0.817617  0.078667                   Metabolic      155      448   603   
8   0.860594  0.000000    Hyper-/ hypophosphatemia        0        0     0   
9   0.874638  0.000000                Mitochondria       21       67    88   
10  0.788918  0.077662                   Fertility        0        0     0   
11  0.807323  0.014826    Congenital heart defects      305      192   497   
12  0.837326  0.095158           Hereditary cancer     1383     1383  2766   
13  0.850903  0.000000  Early onset cardiomyopathy      264      290   554   
14  0.794604  0.000000             Noonan syndrome      222      130   352   
15  0.856396  0.000000  Primary ciliary dyskinesia        0        0     0   
16  0.830447  0.000000     Developmental disorders      417      384   801   
17  0.733199  0.000000           Leukemia-Lymphoma       45       15    60   

   n_train  
0      103  
1        0  
2      397  
3      135  
4       90  
5        1  
6        0  
7      603  
8        0  
9       88  
10       0  
11     497  
12    2766  
13     554  
14     352  
15       0  
16     801  
17      60  
The mean of the M-W analysis AUC: 0.8309474082954248

Hereditary cancer | Default hyper | unbalanced

Back

In [6]:
full_auc_analysis(
    curr_setup = 'Hereditary cancer, Default Hyper, unbalanced ds',
    train_loc = './test_output/model_2_0/default_hyper/ek/unbalanced/ek_unbalanced_train.txt',
    test_loc = './test_output/model_2_0/default_hyper/ek/unbalanced/ek_unbalanced_test.txt',
    auc_analysis_name= 'auc_analysis_hc_unbalanced.csv',
    training_set_loc='./datafiles/hereditarycancer.txt.gz',
    filter_out='./test_output/model_2_0/default_hyper/ek/unbalanced/splitted_train_dataset.tsv.gz'
)
There are 26776 samples in the training set.
AUC analysis of the training dataset reveals AUC: 0.9241746446715966
AUC analysis of the testing dataset reveals AUC: 0.6860944856987427
File ./not_saving_directory/auc_analysis_hc_unbalanced.csv not found, creating.
I am stilling running, done 4%
I am stilling running, done 9%
I am stilling running, done 13%
I am stilling running, done 18%
I am stilling running, done 23%
I am stilling running, done 28%
I am stilling running, done 33%
I am stilling running, done 39%
I am stilling running, done 44%
I am stilling running, done 50%
I am stilling running, done 56%
I am stilling running, done 62%
I am stilling running, done 68%
I am stilling running, done 73%
I am stilling running, done 79%
I am stilling running, done 85%
I am stilling running, done 91%
I am stilling running, done 97%
Top 10 worst performing genes: 
          gene       auc        f1  recall  fpr  precision n_benign n_malign  \
2275     CDK10  0.000000  0.500000     0.5  0.5   0.500000        1        2   
229      ASCC1  0.031746  0.000000     0.0  1.0   0.000000       63        1   
1153    ABCA13  0.076923  0.027027     1.0  0.0   0.013699       78        1   
1845      SAA1  0.098361  0.000000     0.0  1.0   0.000000       61        1   
489    C12orf4  0.136364  0.090909     1.0  0.0   0.047619       22        1   
2097    UBQLN2  0.142857  0.000000     0.0  1.0   0.000000       49        1   
79    ARHGAP29  0.166667  0.428571     1.0  0.0   0.272727        8        3   
805      ERMAP  0.190476  0.000000     0.0  1.0   0.000000       21        1   
172   KIAA0196  0.202614  0.000000     0.0  1.0   0.000000      306        1   
1784    CELSR2  0.237136  0.000000     0.0  1.0   0.000000      894        1   

     n_tot n_train n_test  
2275     3       3      0  
229     64      56      8  
1153    79      51     28  
1845    62      61      1  
489     23      18      5  
2097    50      50      0  
79      11      10      1  
805     22      18      4  
172    307     299      8  
1784   895     888      7  
/home/rjsietsma/PycharmProjects/dsls_master_thesis/side_scripts/utilities.py:429: RuntimeWarning: Mean of empty slice.
  x_mean = x.mean()
/home/rjsietsma/PycharmProjects/dsls_master_thesis/venv/lib/python3.8/site-packages/numpy/core/_methods.py:170: RuntimeWarning: invalid value encountered in double_scalars
  ret = ret.dtype.type(ret / rcount)
/home/rjsietsma/PycharmProjects/dsls_master_thesis/venv/lib/python3.8/site-packages/numpy/core/_methods.py:233: RuntimeWarning: Degrees of freedom <= 0 for slice
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
/home/rjsietsma/PycharmProjects/dsls_master_thesis/venv/lib/python3.8/site-packages/numpy/core/_methods.py:194: RuntimeWarning: invalid value encountered in true_divide
  arrmean = um.true_divide(
/home/rjsietsma/PycharmProjects/dsls_master_thesis/venv/lib/python3.8/site-packages/numpy/core/_methods.py:226: RuntimeWarning: invalid value encountered in double_scalars
  ret = ret.dtype.type(ret / rcount)
UMCG genepanels Mann-Whitney analysis: 
    two-sided      less   greater category_x                 compared_to  \
0    0.564586  0.282293  0.721236        all               Neurogenetics   
1    0.203162  0.101581  0.904554        all                 Amyloidosis   
2    0.315168  0.157584  0.844720        all              Cardiovascular   
3    0.137376  0.933013  0.068688        all    Primary Immunodeficiency   
4    0.090411  0.045205  0.956016        all                        Skin   
5    0.365497  0.182749  0.820108        all                    Epilepsy   
6    0.190893  0.910411  0.095446        all                  Angioedema   
7    0.964197  0.482099  0.525054        all                   Metabolic   
8    0.305295  0.855540  0.152647        all    Hyper-/ hypophosphatemia   
9    0.376834  0.820977  0.188417        all                Mitochondria   
10   0.664955  0.674979  0.332478        all                   Fertility   
11   0.173478  0.086739  0.916474        all    Congenital heart defects   
12   0.006865  0.996654  0.003432        all           Hereditary cancer   
13   0.804561  0.611334  0.402280        all  Early onset cardiomyopathy   
14   0.119853  0.059926  0.944166        all             Noonan syndrome   
15   0.339858  0.838863  0.169929        all  Primary ciliary dyskinesia   
16   0.524583  0.262292  0.749096        all     Developmental disorders   
17   0.190893  0.095446  0.910411        all           Leukemia-Lymphoma   

        mean       std                  category_y n_benign n_malign  n_tot  \
0   0.930029  0.033143               Neurogenetics     1090      573   1663   
1   0.889414  0.000000                 Amyloidosis        0        0      0   
2   0.929832  0.034947              Cardiovascular     1381      943   2324   
3   0.952086  0.040721    Primary Immunodeficiency     1411      590   2001   
4   0.917868  0.029784                        Skin      822       42    864   
5   0.928969  0.027688                    Epilepsy       12        2     14   
6   0.975556  0.000000                  Angioedema        0        0      0   
7   0.922668  0.053665                   Metabolic     2922     2215   5137   
8   0.966318  0.000000    Hyper-/ hypophosphatemia        0        0      0   
9   0.962532  0.000000                Mitochondria      411      255    666   
10  0.936928  0.050237                   Fertility        0        0      0   
11  0.911730  0.017069    Congenital heart defects     2324     1037   3361   
12  0.948217  0.041597           Hereditary cancer    16869     9907  26776   
13  0.947904  0.000000  Early onset cardiomyopathy     3682     3126   6808   
14  0.853254  0.000000             Noonan syndrome     1081      815   1896   
15  0.965604  0.000000  Primary ciliary dyskinesia        0        0      0   
16  0.923591  0.000000     Developmental disorders     3999     2044   6043   
17  0.889348  0.000000           Leukemia-Lymphoma      466       29    495   

   n_train  
0     1663  
1        0  
2     2324  
3     2001  
4      864  
5       14  
6        0  
7     5137  
8        0  
9      666  
10       0  
11    3361  
12   26776  
13    6808  
14    1896  
15       0  
16    6043  
17     495  
The mean of the M-W analysis AUC: 0.9306581071783884

Neurogenetics | Default hyper | balanced

Back

In [7]:
full_auc_analysis(
    curr_setup = 'Neurogenetics, Default Hyper, balanced ds',
    train_loc = './test_output/model_2_0/default_hyper/neuro/balanced/neuro_balanced_train.txt',
    test_loc = './test_output/model_2_0/default_hyper/neuro/balanced/neuro_balanced_test.txt',
    auc_analysis_name= 'auc_analysis_neuro_balanced.csv',
    training_set_loc='./test_output/model_2_0/default_hyper/neuro/balanced/train_balanced_dataset.tsv.gz',
    filter_out='./test_output/model_2_0/default_hyper/neuro/unbalanced/splitted_train_dataset.tsv.gz'
)
There are 5804 samples in the training set.
AUC analysis of the training dataset reveals AUC: 0.8013512248440763
AUC analysis of the testing dataset reveals AUC: 0.7297721143675924
File ./not_saving_directory/auc_analysis_neuro_balanced.csv not found, creating.
I am stilling running, done 5%
I am stilling running, done 9%
I am stilling running, done 14%
I am stilling running, done 19%
I am stilling running, done 24%
I am stilling running, done 29%
I am stilling running, done 34%
I am stilling running, done 39%
I am stilling running, done 45%
I am stilling running, done 50%
I am stilling running, done 56%
I am stilling running, done 62%
I am stilling running, done 68%
I am stilling running, done 73%
I am stilling running, done 79%
I am stilling running, done 85%
I am stilling running, done 91%
I am stilling running, done 97%
Top 10 worst performing genes: 
          gene  auc        f1  recall  fpr  precision n_benign n_malign n_tot  \
969       PIGW  0.0  0.857143     1.0  0.0   0.750000        1        3     4   
1997      PBX1  0.0  0.666667     1.0  0.0   0.500000        1        1     2   
2201  TRAF3IP1  0.0  0.000000     0.0  1.0   0.000000        3        1     4   
944       SMPX  0.0  0.285714     1.0  0.0   0.166667        5        1     6   
1615      FAR1  0.0  0.666667     1.0  0.0   0.500000        1        1     2   
475     SLC6A9  0.0  0.666667     1.0  0.0   0.500000        2        2     4   
1328  HIST1H1E  0.0  0.285714     1.0  0.0   0.166667        5        1     6   
1821     ERMAP  0.0  0.000000     0.0  1.0   0.000000       21        1    22   
1772      LIPE  0.0  0.500000     1.0  0.0   0.333333        2        1     3   
2150   RNASET2  0.0  0.000000     0.0  1.0   0.000000       17        1    18   

     n_train n_test  
969        4      0  
1997       2      0  
2201       4      0  
944        5      1  
1615       2      0  
475        4      0  
1328       5      1  
1821      18      4  
1772       3      0  
2150      18      0  
/home/rjsietsma/PycharmProjects/dsls_master_thesis/side_scripts/utilities.py:429: RuntimeWarning: Mean of empty slice.
  x_mean = x.mean()
/home/rjsietsma/PycharmProjects/dsls_master_thesis/venv/lib/python3.8/site-packages/numpy/core/_methods.py:170: RuntimeWarning: invalid value encountered in double_scalars
  ret = ret.dtype.type(ret / rcount)
/home/rjsietsma/PycharmProjects/dsls_master_thesis/venv/lib/python3.8/site-packages/numpy/core/_methods.py:233: RuntimeWarning: Degrees of freedom <= 0 for slice
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
/home/rjsietsma/PycharmProjects/dsls_master_thesis/venv/lib/python3.8/site-packages/numpy/core/_methods.py:194: RuntimeWarning: invalid value encountered in true_divide
  arrmean = um.true_divide(
/home/rjsietsma/PycharmProjects/dsls_master_thesis/venv/lib/python3.8/site-packages/numpy/core/_methods.py:226: RuntimeWarning: invalid value encountered in double_scalars
  ret = ret.dtype.type(ret / rcount)
UMCG genepanels Mann-Whitney analysis: 
    two-sided      less   greater category_x                 compared_to  \
0    0.012309  0.994025  0.006154        all               Neurogenetics   
1    0.229405  0.892002  0.114702        all                 Amyloidosis   
2    0.292572  0.146286  0.855907        all              Cardiovascular   
3    0.140843  0.931312  0.070421        all    Primary Immunodeficiency   
4    0.850769  0.579695  0.425385        all                        Skin   
5    0.214641  0.894668  0.107320        all                    Epilepsy   
6    0.119853  0.059926  0.944166        all                  Angioedema   
7    0.907093  0.553560  0.453547        all                   Metabolic   
8    0.257973  0.878303  0.128987        all    Hyper-/ hypophosphatemia   
9    0.179177  0.915998  0.089589        all                Mitochondria   
10   0.223698  0.111849  0.892026        all                   Fertility   
11   0.143131  0.071566  0.931208        all    Congenital heart defects   
12   0.038148  0.019074  0.981318        all           Hereditary cancer   
13   0.620673  0.702030  0.310336        all  Early onset cardiomyopathy   
14   0.168005  0.084002  0.921318        all             Noonan syndrome   
15   0.436748  0.218374  0.791904        all  Primary ciliary dyskinesia   
16   0.832030  0.416015  0.597720        all     Developmental disorders   
17   0.671422  0.335711  0.677080        all           Leukemia-Lymphoma   

        mean       std                  category_y n_benign n_malign n_tot  \
0   0.828391  0.153392               Neurogenetics     2902     2902  5804   
1   0.885940  0.000000                 Amyloidosis        0        0     0   
2   0.801173  0.107130              Cardiovascular       14       13    27   
3   0.861268  0.038192    Primary Immunodeficiency       74      249   323   
4   0.840010  0.015293                        Skin       33       55    88   
5   0.820189  0.120639                    Epilepsy      451     1125  1576   
6   0.689206  0.000000                  Angioedema        0        0     0   
7   0.823705  0.059821                   Metabolic     1190     2318  3508   
8   0.880364  0.000000    Hyper-/ hypophosphatemia        1       44    45   
9   0.892589  0.000000                Mitochondria      250      359   609   
10  0.806290  0.033958                   Fertility        1       44    45   
11  0.808927  0.012534    Congenital heart defects      120       94   214   
12  0.811513  0.072916           Hereditary cancer       31      165   196   
13  0.853127  0.000000  Early onset cardiomyopathy      390      208   598   
14  0.767962  0.000000             Noonan syndrome        0        0     0   
15  0.813583  0.000000  Primary ciliary dyskinesia        0        0     0   
16  0.830385  0.000000     Developmental disorders     1445     2427  3872   
17  0.826277  0.000000           Leukemia-Lymphoma       12        5    17   

   n_train  
0     5804  
1        0  
2       27  
3      323  
4       88  
5     1576  
6        0  
7     3508  
8       45  
9      609  
10      45  
11     214  
12     196  
13     598  
14       0  
15       0  
16    3872  
17      17  
The mean of the M-W analysis AUC: 0.8244943388331822

Neurogenetics | Default hyper | unbalanced

Back

In [9]:
full_auc_analysis(
    curr_setup = 'Neurogenetics, Default Hyper, unbalanced ds',
    train_loc = './test_output/model_2_0/default_hyper/neuro/unbalanced/neuro_unbalanced_train.txt',
    test_loc = './test_output/model_2_0/default_hyper/neuro/unbalanced/neuro_unbalanced_test.txt',
    auc_analysis_name= 'auc_analysis_neuro_unbalanced.csv',
    training_set_loc='./datafiles/neurogenetics.txt.gz',
    filter_out='./test_output/model_2_0/default_hyper/neuro/unbalanced/splitted_train_dataset.tsv.gz'
)
There are 29769 samples in the training set.
AUC analysis of the training dataset reveals AUC: 0.9381871874633931
AUC analysis of the testing dataset reveals AUC: 0.6880576676116006
File ./not_saving_directory/auc_analysis_neuro_unbalanced.csv not found, creating.
I am stilling running, done 4%
I am stilling running, done 9%
I am stilling running, done 13%
I am stilling running, done 18%
I am stilling running, done 23%
I am stilling running, done 28%
I am stilling running, done 33%
I am stilling running, done 38%
I am stilling running, done 44%
I am stilling running, done 50%
I am stilling running, done 56%
I am stilling running, done 61%
I am stilling running, done 67%
I am stilling running, done 73%
I am stilling running, done 79%
I am stilling running, done 85%
I am stilling running, done 91%
I am stilling running, done 96%
Top 10 worst performing genes: 
          gene       auc        f1  recall  fpr  precision n_benign n_malign  \
2241    UBQLN2  0.000000  0.000000     0.0  1.0   0.000000        6        1   
2239  TRAF3IP1  0.000000  0.000000     0.0  1.0   0.000000        3        1   
11       ASCC1  0.015873  0.000000     0.0  1.0   0.000000       63        1   
1471      SAA1  0.016393  0.000000     0.0  1.0   0.000000       61        1   
1078     KITLG  0.095238  0.000000     0.0  1.0   0.000000       21        1   
2164       TNC  0.095406  0.000000     0.0  1.0   0.000000      283        1   
1251       CA4  0.183007  0.000000     0.0  1.0   0.000000      153        1   
574      ERMAP  0.190476  0.000000     0.0  1.0   0.000000       21        1   
922   HIST1H1E  0.200000  0.333333     1.0  0.0   0.200000        5        1   
820      PTHLH  0.200000  0.285714     1.0  0.0   0.166667        5        1   

     n_tot n_train n_test  
2241     7       7      0  
2239     4       4      0  
11      64      56      8  
1471    62      61      1  
1078    22      22      0  
2164   284     278      6  
1251   154     153      1  
574     22      18      4  
922      6       5      1  
820      6       4      2  
/home/rjsietsma/PycharmProjects/dsls_master_thesis/side_scripts/utilities.py:429: RuntimeWarning: Mean of empty slice.
  x_mean = x.mean()
/home/rjsietsma/PycharmProjects/dsls_master_thesis/venv/lib/python3.8/site-packages/numpy/core/_methods.py:170: RuntimeWarning: invalid value encountered in double_scalars
  ret = ret.dtype.type(ret / rcount)
/home/rjsietsma/PycharmProjects/dsls_master_thesis/venv/lib/python3.8/site-packages/numpy/core/_methods.py:233: RuntimeWarning: Degrees of freedom <= 0 for slice
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
/home/rjsietsma/PycharmProjects/dsls_master_thesis/venv/lib/python3.8/site-packages/numpy/core/_methods.py:194: RuntimeWarning: invalid value encountered in true_divide
  arrmean = um.true_divide(
/home/rjsietsma/PycharmProjects/dsls_master_thesis/venv/lib/python3.8/site-packages/numpy/core/_methods.py:226: RuntimeWarning: invalid value encountered in double_scalars
  ret = ret.dtype.type(ret / rcount)
UMCG genepanels Mann-Whitney analysis: 
    two-sided      less   greater category_x                 compared_to  \
0    0.240783  0.881696  0.120391        all               Neurogenetics   
1    0.376834  0.188417  0.820977        all                 Amyloidosis   
2    0.148014  0.074007  0.927330        all              Cardiovascular   
3    0.167022  0.918464  0.083511        all    Primary Immunodeficiency   
4    0.109065  0.054532  0.946886        all                        Skin   
5    0.151007  0.075504  0.926026        all                    Epilepsy   
6    0.229405  0.114702  0.892002        all                  Angioedema   
7    0.596344  0.298172  0.708023        all                   Metabolic   
8    0.436748  0.791904  0.218374        all    Hyper-/ hypophosphatemia   
9    0.595941  0.714166  0.297970        all                Mitochondria   
10   0.186879  0.909957  0.093440        all                   Fertility   
11   0.137583  0.068792  0.933899        all    Congenital heart defects   
12   0.004518  0.997801  0.002259        all           Hereditary cancer   
13   1.000000  0.514100  0.500000        all  Early onset cardiomyopathy   
14   0.128500  0.064250  0.940074        all             Noonan syndrome   
15   0.571668  0.726062  0.285834        all  Primary ciliary dyskinesia   
16   0.501809  0.250904  0.760216        all     Developmental disorders   
17   0.168005  0.084002  0.921318        all           Leukemia-Lymphoma   

        mean       std                  category_y n_benign n_malign  n_tot  \
0   0.950007  0.040521               Neurogenetics    23659     6110  29769   
1   0.922235  0.000000                 Amyloidosis        0        0      0   
2   0.913033  0.073980              Cardiovascular      241       30    271   
3   0.958281  0.035023    Primary Immunodeficiency     1593      725   2318   
4   0.930762  0.018541                        Skin      283      100    383   
5   0.923708  0.055937                    Epilepsy     5563     2235   7798   
6   0.907302  0.000000                  Angioedema        0        0      0   
7   0.922385  0.055332                   Metabolic    13038     4790  17828   
8   0.972489  0.000000    Hyper-/ hypophosphatemia       24       76    100   
9   0.962787  0.000000                Mitochondria     2435      736   3171   
10  0.969860  0.014169                   Fertility       24       76    100   
11  0.919446  0.016597    Congenital heart defects     1060      246   1306   
12  0.961865  0.030144           Hereditary cancer     1090      573   1663   
13  0.949048  0.000000  Early onset cardiomyopathy     2162      406   2568   
14  0.858456  0.000000             Noonan syndrome        0        0      0   
15  0.963416  0.000000  Primary ciliary dyskinesia        0        0      0   
16  0.927865  0.000000     Developmental disorders    13065     4876  17941   
17  0.896653  0.000000           Leukemia-Lymphoma       49        8     57   

   n_train  
0    29769  
1        0  
2      271  
3     2318  
4      383  
5     7798  
6        0  
7    17828  
8      100  
9     3171  
10     100  
11    1306  
12    1663  
13    2568  
14       0  
15       0  
16   17941  
17      57  
The mean of the M-W analysis AUC: 0.9338664649263472

Cardiovascular | Optimal hyper | balanced

Back

In [6]:
full_auc_analysis(
    curr_setup = 'Cardiovascular, Optimal Hyper, balanced ds',
    train_loc = './test_output/model_2_0/random_hyper/cardiovascular/balanced/cardiovascular_balanced_train.txt',
    test_loc = './test_output/model_2_0/random_hyper/cardiovascular/balanced/cardiovascular_balanced_test.txt',
    auc_analysis_name= 'auc_analysis_randomhyper_cardio_balanced.csv',
    training_set_loc='./test_output/model_2_0/default_hyper/cardiovascular/balanced/train_balanced_dataset.tsv.gz',
    model='./test_output/model_2_0/random_hyper/cardiovascular/balanced/xgb_optimal_model.pickle.dat',
    filter_out='./test_output/model_2_0/default_hyper/cardiovascular/unbalanced/splitted_train_dataset.tsv.gz'
)
Parameter learning_rate is set to 0.07549633411939431
Parameter n_estimators is set to 460
Parameter max_depth is set to 14
There are 6642 samples in the training set.
AUC analysis of the training dataset reveals AUC: 0.7603057057313227
AUC analysis of the testing dataset reveals AUC: 0.6703142530460753
File ./not_saving_directory/auc_analysis_randomhyper_cardio_balanced.csv found. Loading.
Top 10 worst performing genes: 
          gene  auc        f1  recall  fpr  precision  n_benign  n_malign  \
1977      INTU  0.0  0.857143     1.0  0.0   0.750000         1         3   
2318       PTH  0.0  0.000000     0.0  1.0   0.000000         3         1   
1716      FAR1  0.0  0.666667     1.0  0.0   0.500000         1         1   
2288     TEX15  0.0  0.000000     0.0  1.0   0.000000         3         1   
2112    SLC6A9  0.0  0.666667     1.0  0.0   0.500000         2         2   
1549      CSTA  0.0  0.666667     1.0  0.0   0.500000         1         1   
805      CALM3  0.0  0.571429     1.0  0.0   0.400000         3         2   
1483      PIGW  0.0  0.857143     1.0  0.0   0.750000         1         3   
2026  HIST1H1E  0.0  0.285714     1.0  0.0   0.166667         5         1   
1681     GRID2  0.0  0.285714     1.0  0.0   0.166667         5         1   

      n_tot  n_train  n_test  
1977      4        4       0  
2318      4        4       0  
1716      2        2       0  
2288      4        4       0  
2112      4        4       0  
1549      2        2       0  
805       5        5       0  
1483      4        4       0  
2026      6        5       1  
1681      6        6       0  
UMCG genepanels Mann-Whitney analysis: 
    two-sided      less   greater category_x                 compared_to  \
0    0.489402  0.244701  0.758577        all               Neurogenetics   
1    0.376828  0.188414  0.820980        all                 Amyloidosis   
2    0.600382  0.703144  0.300191        all              Cardiovascular   
3    0.187846  0.908233  0.093923        all    Primary Immunodeficiency   
4    0.673250  0.668098  0.336625        all                        Skin   
5    0.444831  0.222416  0.780800        all                    Epilepsy   
6    0.119848  0.944168  0.059924        all                  Angioedema   
7    0.935603  0.467802  0.539333        all                   Metabolic   
8    0.190887  0.910414  0.095443        all    Hyper-/ hypophosphatemia   
9    0.243389  0.885301  0.121694        all                Mitochondria   
10   0.563636  0.281818  0.725104        all               Preconception   
11   0.885214  0.442607  0.565522        all    Congenital heart defects   
12   0.200041  0.100020  0.901466        all           Hereditary cancer   
13   0.571663  0.726065  0.285831        all  Early onset cardiomyopathy   
14   0.157359  0.926382  0.078679        all             Noonan syndrome   
15   0.595936  0.297968  0.714169        all  Primary ciliary dyskinesia   
16   1.000000  0.500000  0.514100        all     Developmental disorders   
17   0.777329  0.624815  0.388664        all           Leukemia-Lymphoma   

        mean       std                  category_y n_benign n_patho n_tot  \
0   0.780924  0.033015               Neurogenetics        7      21    28   
1   0.754627  0.000000                 Amyloidosis       13      13    26   
2   0.782253  0.164365              Cardiovascular     3321    3321  6642   
3   0.820509  0.055584    Primary Immunodeficiency       12      57    69   
4   0.794235  0.031730                        Skin       83     187   270   
5   0.757884  0.089280                    Epilepsy       15       5    20   
6   0.935556  0.000000                  Angioedema        0       0     0   
7   0.780093  0.040585                   Metabolic      546    1590  2136   
8   0.857969  0.000000    Hyper-/ hypophosphatemia        0       0     0   
9   0.836868  0.000000                Mitochondria        6      40    46   
10  0.674702  0.172759               Preconception        0       0     0   
11  0.784044  0.003214    Congenital heart defects      758    1104  1862   
12  0.776781  0.049743           Hereditary cancer      182     447   629   
13  0.804497  0.000000  Early onset cardiomyopathy     1958    2220  4178   
14  0.884845  0.000000             Noonan syndrome      326     649   975   
15  0.772707  0.000000  Primary ciliary dyskinesia        0       0     0   
16  0.785861  0.000000     Developmental disorders      572     995  1567   
17  0.794875  0.000000           Leukemia-Lymphoma        3      57    60   

   n_train  
0       28  
1       26  
2     6642  
3       69  
4      270  
5       20  
6        0  
7     2136  
8        0  
9       46  
10       0  
11    1862  
12     629  
13    4178  
14     975  
15       0  
16    1567  
17      60  
The mean of the M-W analysis AUC: 0.7988459948007232

Cardiovascular | Optimal hyper | unbalanced

Back

In [4]:
full_auc_analysis(
    curr_setup = 'Cardiovascular, Optimal Hyper, unbalanced ds',
    train_loc = './test_output/model_2_0/random_hyper/cardiovascular/unbalanced/cardiovascular_unbalanced_train.txt',
    test_loc = './test_output/model_2_0/random_hyper/cardiovascular/unbalanced/cardiovascular_unbalanced_test.txt',
    auc_analysis_name= 'auc_analysis_randomhyper_cardio_unbalanced.csv',
    training_set_loc='./datafiles/cardiovascular.txt.gz',
    model='./test_output/model_2_0/random_hyper/cardiovascular/unbalanced/xgb_optimal_model.pickle.dat',
    filter_out='./test_output/model_2_0/default_hyper/cardiovascular/unbalanced/splitted_train_dataset.tsv.gz'
)
Parameter learning_rate is set to 0.1467122152765178
Parameter n_estimators is set to 222
Parameter max_depth is set to 9
There are 35058 samples in the training set.
AUC analysis of the training dataset reveals AUC: 0.8960473563876927
AUC analysis of the testing dataset reveals AUC: 0.6551261140846805
File ./not_saving_directory/auc_analysis_randomhyper_cardio_unbalanced.csv found. Loading.
Top 10 worst performing genes: 
          gene       auc        f1  recall  fpr  precision  n_benign  \
2172      SAA1  0.000000  0.000000     0.0  1.0   0.000000        61   
1993      FAR1  0.000000  0.666667     1.0  0.0   0.500000         1   
1510     PTHLH  0.000000  0.285714     1.0  0.0   0.166667         5   
2277     CDK10  0.000000  0.500000     0.5  0.5   0.500000         1   
2282  TRAF3IP1  0.000000  0.000000     0.0  1.0   0.000000         3   
2195    MRPS34  0.000000  0.800000     1.0  0.0   0.666667         1   
287      ASCC1  0.015873  0.000000     0.0  1.0   0.000000        63   
1766    UBQLN2  0.040816  0.000000     0.0  1.0   0.000000        49   
888   KIAA0196  0.078431  0.000000     0.0  1.0   0.000000       306   
1354     ERMAP  0.095238  0.000000     0.0  1.0   0.000000        21   

      n_malign  n_tot  n_train  n_test  
2172         1     62       61       1  
1993         1      2        2       0  
1510         1      6        4       2  
2277         2      3        3       0  
2282         1      4        4       0  
2195         2      3        3       0  
287          1     64       56       8  
1766         1     50       50       0  
888          1    307      299       8  
1354         1     22       18       4  
/home/rjsietsma/PycharmProjects/dsls_master_thesis/side_scripts/utilities.py:429: RuntimeWarning: Mean of empty slice.
  x_mean = x.mean()
/home/rjsietsma/PycharmProjects/dsls_master_thesis/venv/lib/python3.8/site-packages/numpy/core/_methods.py:170: RuntimeWarning: invalid value encountered in double_scalars
  ret = ret.dtype.type(ret / rcount)
/home/rjsietsma/PycharmProjects/dsls_master_thesis/venv/lib/python3.8/site-packages/numpy/core/_methods.py:233: RuntimeWarning: Degrees of freedom <= 0 for slice
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
/home/rjsietsma/PycharmProjects/dsls_master_thesis/venv/lib/python3.8/site-packages/numpy/core/_methods.py:194: RuntimeWarning: invalid value encountered in true_divide
  arrmean = um.true_divide(
/home/rjsietsma/PycharmProjects/dsls_master_thesis/venv/lib/python3.8/site-packages/numpy/core/_methods.py:226: RuntimeWarning: invalid value encountered in double_scalars
  ret = ret.dtype.type(ret / rcount)
UMCG genepanels Mann-Whitney analysis: 
    two-sided      less   greater category_x                 compared_to  \
0    0.384681  0.192341  0.810510        all               Neurogenetics   
1    0.137624  0.068812  0.935750        all                 Amyloidosis   
2    0.762062  0.381031  0.622626        all              Cardiovascular   
3    0.070293  0.965848  0.035147        all    Primary Immunodeficiency   
4    0.049331  0.024666  0.976075        all                        Skin   
5    0.177245  0.088623  0.913105        all                    Epilepsy   
6    0.671421  0.677080  0.335710        all                  Angioedema   
7    0.739766  0.369883  0.636875        all                   Metabolic   
8    0.243393  0.885298  0.121697        all    Hyper-/ hypophosphatemia   
9    0.723714  0.651306  0.361857        all                Mitochondria   
10   0.852753  0.581694  0.426376        all                   Fertility   
11   0.103267  0.051633  0.950512        all    Congenital heart defects   
12   0.004169  0.997971  0.002084        all           Hereditary cancer   
13   0.804560  0.611335  0.402280        all  Early onset cardiomyopathy   
14   0.376832  0.820978  0.188416        all             Noonan syndrome   
15   0.971801  0.528182  0.485900        all  Primary ciliary dyskinesia   
16   0.524582  0.262291  0.749096        all     Developmental disorders   
17   0.376832  0.188416  0.820978        all           Leukemia-Lymphoma   

        mean       std                  category_y n_benign n_malign  n_tot  \
0   0.905505  0.037390               Neurogenetics      241       30    271   
1   0.829961  0.000000                 Amyloidosis      117       21    138   
2   0.901250  0.064885              Cardiovascular    29144     5914  35058   
3   0.938256  0.030902    Primary Immunodeficiency      147       78    225   
4   0.892325  0.023660                        Skin     1093      299   1392   
5   0.893090  0.057048                    Epilepsy       91        7     98   
6   0.932063  0.000000                  Angioedema        0        0      0   
7   0.904427  0.036595                   Metabolic     6111     2871   8982   
8   0.952345  0.000000    Hyper-/ hypophosphatemia        0        0      0   
9   0.929329  0.000000                Mitochondria      161       61    222   
10  0.881263  0.101489                   Fertility        0        0      0   
11  0.886878  0.010888    Congenital heart defects     5939     1912   7851   
12  0.931656  0.035611           Hereditary cancer     1381      943   2324   
13  0.926714  0.000000  Early onset cardiomyopathy    18294     4139  22433   
14  0.944735  0.000000             Noonan syndrome     2524     1172   3696   
15  0.920358  0.000000  Primary ciliary dyskinesia        0        0      0   
16  0.895652  0.000000     Developmental disorders     4922     1844   6766   
17  0.888229  0.000000           Leukemia-Lymphoma       75       78    153   

   n_train  
0      271  
1      138  
2    35058  
3      225  
4     1392  
5       98  
6        0  
7     8982  
8        0  
9      222  
10       0  
11    7851  
12    2324  
13   22433  
14    3696  
15       0  
16    6766  
17     153  
The mean of the M-W analysis AUC: 0.9085575997365426

Dyslipid | Optimal hyper | balanced

Back

In [7]:
full_auc_analysis(
    curr_setup = 'Dyslipid, Optimal Hyper, balanced ds',
    train_loc = './test_output/model_2_0/random_hyper/dyslipid/balanced/dyslipid_balanced_train.txt',
    test_loc = './test_output/model_2_0/random_hyper/dyslipid/balanced/dyslipid_balanced_test.txt',
    auc_analysis_name= 'auc_analysis_randomhyper_dyslipid_balanced.csv',
    training_set_loc='./test_output/model_2_0/default_hyper/dyslipid/balanced/train_balanced_dataset.tsv.gz',
    model='./test_output/model_2_0/random_hyper/dyslipid/balanced/xgb_optimal_model.pickle.dat',
    filter_out='./datafiles/dyslipid.txt.gz'
)
Parameter learning_rate is set to 0.1527244637312314
Parameter n_estimators is set to 211
Parameter max_depth is set to 10
There are 1530 samples in the training set.
AUC analysis of the training dataset reveals AUC: 0.7188298851914152
AUC analysis of the testing dataset reveals AUC: 0.5654116356142449
File ./not_saving_directory/auc_analysis_randomhyper_dyslipid_balanced.csv found. Loading.
Top 10 worst performing genes: 
        gene  auc        f1  recall  fpr  precision  n_benign  n_patho  n_tot  \
813     ZIC1  0.0  0.500000     1.0  0.0   0.333333         2        1      3   
2026  KISS1R  0.0  0.250000     1.0  0.0   0.142857         6        1      7   
1456  GTF2H5  0.0  0.400000     1.0  0.0   0.250000         3        1      4   
1095   SPG21  0.0  0.666667     1.0  0.0   0.500000         1        1      2   
2337  AKR1D1  0.0  0.000000     0.0  1.0   0.000000         9        2     11   
2339    SORD  0.0  0.500000     1.0  0.0   0.333333         2        1      3   
2257    PBX1  0.0  0.666667     1.0  0.0   0.500000         1        1      2   
485    ASCC1  0.0  0.000000     0.0  1.0   0.000000        63        1     64   
2254    SAA1  0.0  0.000000     0.0  1.0   0.000000        61        1     62   
901    PATL2  0.0  0.750000     1.0  0.0   0.600000         2        3      5   

      n_train  n_test  
813         3       0  
2026        7       0  
1456        4       0  
1095        2       0  
2337       11       0  
2339        2       1  
2257        2       0  
485        56       8  
2254       61       1  
901         5       0  
UMCG genepanels Mann-Whitney analysis: 
    two-sided      less   greater category_x                 compared_to  \
0    0.378982  0.189491  0.813334        all               Neurogenetics   
1    0.243395  0.885298  0.121697        all                 Amyloidosis   
2    0.019259  0.990616  0.009629        all              Cardiovascular   
3    0.388227  0.194114  0.809434        all    Primary Immunodeficiency   
4    0.256245  0.128123  0.874575        all                        Skin   
5    0.035038  0.017519  0.982944        all                    Epilepsy   
6    0.595941  0.297970  0.714166        all                  Angioedema   
7    0.808479  0.602701  0.404239        all                   Metabolic   
8    0.479569  0.239784  0.771061        all    Hyper-/ hypophosphatemia   
9    0.322274  0.847353  0.161137        all                Mitochondria   
10   0.421227  0.210613  0.795291        all               Preconception   
11   0.804540  0.402270  0.605688        all    Congenital heart defects   
12   0.028966  0.985827  0.014483        all           Hereditary cancer   
13   0.697389  0.664289  0.348694        all  Early onset cardiomyopathy   
14   0.396218  0.198109  0.811583        all             Noonan syndrome   
15   0.257973  0.128987  0.878303        all  Primary ciliary dyskinesia   
16   0.915543  0.556223  0.457772        all     Developmental disorders   
17   0.671422  0.335711  0.677080        all           Leukemia-Lymphoma   

        mean       std                  category_y n_benign n_patho n_tot  \
0   0.749149  0.084097               Neurogenetics       13      14    27   
1   0.847260  0.000000                 Amyloidosis       17       0    17   
2   0.841598  0.098745              Cardiovascular      765     765  1530   
3   0.752314  0.074077    Primary Immunodeficiency        0       0     0   
4   0.761690  0.036476                        Skin        0       0     0   
5   0.733013  0.073672                    Epilepsy        0       0     0   
6   0.753333  0.000000                  Angioedema        0       0     0   
7   0.781839  0.047091                   Metabolic      549     752  1301   
8   0.744960  0.000000    Hyper-/ hypophosphatemia        0       0     0   
9   0.826683  0.000000                Mitochondria        0       0     0   
10  0.765404  0.088950               Preconception        0       0     0   
11  0.770417  0.009618    Congenital heart defects        0       0     0   
12  0.799505  0.041254           Hereditary cancer        0       0     0   
13  0.795379  0.000000  Early onset cardiomyopathy        0       0     0   
14  0.736391  0.000000             Noonan syndrome        0       0     0   
15  0.721223  0.000000  Primary ciliary dyskinesia        0       0     0   
16  0.775805  0.000000     Developmental disorders        3      12    15   
17  0.756923  0.000000           Leukemia-Lymphoma        0       0     0   

   n_train  
0       27  
1       17  
2     1530  
3        0  
4        0  
5        0  
6        0  
7     1301  
8        0  
9        0  
10       0  
11       0  
12       0  
13       0  
14       0  
15       0  
16      15  
17       0  
The mean of the M-W analysis AUC: 0.7729381548688998

Dyslipid | Optimal hyper | unbalanced

Back

In [6]:
full_auc_analysis(
    curr_setup = 'Dyslipid, Optimal Hyper, unbalanced ds',
    train_loc = './test_output/model_2_0/random_hyper/dyslipid/unbalanced/dyslipid_unbalanced_train.txt',
    test_loc = './test_output/model_2_0/random_hyper/dyslipid/unbalanced/dyslipid_unbalanced_test.txt',
    auc_analysis_name= 'auc_analysis_randomhyper_dyslipid_unbalanced.csv',
    training_set_loc='./datafiles/dyslipid.txt.gz',
    model='./test_output/model_2_0/random_hyper/dyslipid/unbalanced/xgb_optimal_model.pickle.dat',
    filter_out='./datafiles/dyslipid.txt.gz'
)
Parameter learning_rate is set to 0.15442510030625153
Parameter n_estimators is set to 257
Parameter max_depth is set to 17
There are 5029 samples in the training set.
AUC analysis of the training dataset reveals AUC: 0.8428616243655226
AUC analysis of the testing dataset reveals AUC: 0.5908200773253944
File ./not_saving_directory/auc_analysis_randomhyper_dyslipid_unbalanced.csv found. Loading.
Top 10 worst performing genes: 
          gene       auc        f1    recall       fpr  precision  n_benign  \
2353        F2  0.000000  0.000000  0.000000  1.000000   0.000000         2   
1748  KIAA0556  0.000000  0.666667  1.000000  0.000000   0.500000         1   
899      PTHLH  0.000000  0.285714  1.000000  0.000000   0.166667         5   
240   KIAA0196  0.000000  0.000000  0.000000  1.000000   0.000000       306   
2320     AP1S2  0.000000  0.000000  0.000000  1.000000   0.000000         2   
2154      FAR1  0.000000  0.666667  1.000000  0.000000   0.500000         1   
1882      INTU  0.000000  0.400000  0.333333  0.666667   0.500000         1   
2344      UROS  0.000000  0.000000  0.000000  1.000000   0.000000         2   
2346  TRAF3IP1  0.000000  0.000000  0.000000  1.000000   0.000000         3   
657      ASCC1  0.031746  0.000000  0.000000  1.000000   0.000000        63   

      n_malign  n_tot  n_train  n_test  
2353         1      3        3       0  
1748         1      2        2       0  
899          1      6        4       2  
240          1    307      299       8  
2320         1      3        3       0  
2154         1      2        2       0  
1882         3      4        4       0  
2344         1      3        3       0  
2346         1      4        4       0  
657          1     64       56       8  
UMCG genepanels Mann-Whitney analysis: 
    two-sided      less   greater category_x                 compared_to  \
0    0.529730  0.264865  0.738553        all               Neurogenetics   
1    0.457872  0.228936  0.781629        all                 Amyloidosis   
2    0.003024  0.998535  0.001512        all              Cardiovascular   
3    0.760430  0.624716  0.380215        all    Primary Immunodeficiency   
4    0.000247  0.000123  0.999883        all                        Skin   
5    0.076399  0.038200  0.962691        all                    Epilepsy   
6    0.479563  0.771064  0.239782        all                  Angioedema   
7    0.808476  0.404238  0.602702        all                   Metabolic   
8    0.376828  0.820980  0.188414        all    Hyper-/ hypophosphatemia   
9    0.501803  0.760218  0.250902        all                Mitochondria   
10   0.950667  0.532873  0.475333        all                   Fertility   
11   0.186873  0.093437  0.909960        all    Congenital heart defects   
12   0.017322  0.991537  0.008661        all           Hereditary cancer   
13   0.777329  0.624815  0.388664        all  Early onset cardiomyopathy   
14   0.147237  0.073618  0.931190        all             Noonan syndrome   
15   0.859704  0.583986  0.429852        all  Primary ciliary dyskinesia   
16   0.620668  0.310334  0.702032        all     Developmental disorders   
17   0.257967  0.128984  0.878306        all           Leukemia-Lymphoma   

        mean       std                  category_y n_benign n_malign n_tot  \
0   0.854051  0.077910               Neurogenetics      194       26   220   
1   0.848314  0.000000                 Amyloidosis       58        3    61   
2   0.934905  0.069896              Cardiovascular     3913     1116  5029   
3   0.883311  0.038777    Primary Immunodeficiency        0        0     0   
4   0.820833  0.021416                        Skin        0        0     0   
5   0.837761  0.082416                    Epilepsy        0        0     0   
6   0.906667  0.000000                  Angioedema        0        0     0   
7   0.869064  0.027803                   Metabolic     2670     1094  3764   
8   0.920916  0.000000    Hyper-/ hypophosphatemia        0        0     0   
9   0.906371  0.000000                Mitochondria        0        0     0   
10  0.844935  0.103762                   Fertility        0        0     0   
11  0.847166  0.010824    Congenital heart defects        0        0     0   
12  0.896182  0.046443           Hereditary cancer        0        0     0   
13  0.885663  0.000000  Early onset cardiomyopathy        0        0     0   
14  0.805300  0.000000             Noonan syndrome        0        0     0   
15  0.881660  0.000000  Primary ciliary dyskinesia        0        0     0   
16  0.854909  0.000000     Developmental disorders       40       24    64   
17  0.835067  0.000000           Leukemia-Lymphoma        0        0     0   

   n_train  
0      220  
1       61  
2     5029  
3        0  
4        0  
5        0  
6        0  
7     3764  
8        0  
9        0  
10       0  
11       0  
12       0  
13       0  
14       0  
15       0  
16      64  
17       0  
The mean of the M-W analysis AUC: 0.8685041310758512

Hereditary cancer | Optimal hyper | balanced

Back

In [8]:
full_auc_analysis(
    curr_setup = 'Hereditary cancer, Optimal Hyper, balanced ds',
    train_loc = './test_output/model_2_0/random_hyper/ek/balanced/ek_balanced_train.txt',
    test_loc = './test_output/model_2_0/random_hyper/ek/balanced/ek_balanced_test.txt',
    auc_analysis_name= 'auc_analysis_randomhyper_ek_balanced.csv',
    model='./test_output/model_2_0/random_hyper/ek/balanced/xgb_ransearch.pickle.dat',
    training_set_loc='./test_output/model_2_0/default_hyper/ek/balanced/train_balanced_dataset.tsv.gz',
    filter_out='./test_output/model_2_0/default_hyper/ek/unbalanced/splitted_train_dataset.tsv.gz'
)
Parameter learning_rate is set to 0.10027603273002453
Parameter n_estimators is set to 268
Parameter max_depth is set to 9
There are 2766 samples in the training set.
AUC analysis of the training dataset reveals AUC: 0.7931771487711897
AUC analysis of the testing dataset reveals AUC: 0.7264596223919082
File ./not_saving_directory/auc_analysis_randomhyper_ek_balanced.csv found. Loading.
Top 10 worst performing genes: 
         gene  auc        f1  recall  fpr  precision  n_benign  n_patho  \
2342      AVP  0.0  0.000000     0.0  1.0   0.000000         1        1   
2316    HPSE2  0.0  0.333333     1.0  0.0   0.200000         4        1   
894    SLC6A9  0.0  0.666667     1.0  0.0   0.500000         2        2   
487      PIGW  0.0  0.857143     1.0  0.0   0.750000         1        3   
1048    ERMAP  0.0  0.000000     0.0  1.0   0.000000        21        1   
1822  ANKRD17  0.0  0.666667     1.0  0.0   0.500000         1        1   
2146   MRPS34  0.0  0.800000     1.0  0.0   0.666667         1        2   
1977    CDK10  0.0  0.500000     0.5  0.5   0.500000         1        2   
1478    HINT1  0.0  0.285714     1.0  0.0   0.166667         5        1   
2277     EFHB  0.0  0.666667     1.0  0.0   0.500000         1        1   

      n_tot  n_train  n_test  
2342      2        2       0  
2316      5        5       0  
894       4        4       0  
487       4        4       0  
1048     22       18       4  
1822      2        1       1  
2146      3        3       0  
1977      3        3       0  
1478      6        5       1  
2277      2        2       0  
UMCG genepanels Mann-Whitney analysis: 
    two-sided      less   greater category_x                 compared_to  \
0    0.522899  0.261449  0.741946        all               Neurogenetics   
1    0.547875  0.737708  0.273938        all                 Amyloidosis   
2    0.081064  0.040532  0.960298        all              Cardiovascular   
3    0.256245  0.874575  0.128123        all    Primary Immunodeficiency   
4    0.608293  0.700378  0.304146        all                        Skin   
5    0.060064  0.030032  0.970699        all                    Epilepsy   
6    0.147242  0.931187  0.073621        all                  Angioedema   
7    0.780783  0.616482  0.390392        all                   Metabolic   
8    0.288920  0.863427  0.144460        all    Hyper-/ hypophosphatemia   
9    0.243395  0.885298  0.121697        all                Mitochondria   
10   0.265434  0.871658  0.132717        all               Preconception   
11   0.143131  0.071566  0.931208        all    Congenital heart defects   
12   0.057044  0.972029  0.028522        all           Hereditary cancer   
13   0.777331  0.624814  0.388666        all  Early onset cardiomyopathy   
14   0.215997  0.107998  0.898419        all             Noonan syndrome   
15   0.416192  0.801891  0.208096        all  Primary ciliary dyskinesia   
16   0.671422  0.335711  0.677080        all     Developmental disorders   
17   0.157364  0.078682  0.926379        all           Leukemia-Lymphoma   

        mean       std                  category_y n_benign n_patho n_tot  \
0   0.842025  0.046191               Neurogenetics       41      62   103   
1   0.873710  0.000000                 Amyloidosis        0       0     0   
2   0.828633  0.073716              Cardiovascular      244     153   397   
3   0.867910  0.045406    Primary Immunodeficiency       60      75   135   
4   0.860627  0.028739                        Skin       66      24    90   
5   0.828062  0.048537                    Epilepsy        1       0     1   
6   0.920635  0.000000                  Angioedema        0       0     0   
7   0.834813  0.077155                   Metabolic      155     448   603   
8   0.895632  0.000000    Hyper-/ hypophosphatemia        0       0     0   
9   0.899918  0.000000                Mitochondria       21      67    88   
10  0.878041  0.013600               Preconception        0       0     0   
11  0.822005  0.018648    Congenital heart defects      305     192   497   
12  0.851914  0.093597           Hereditary cancer     1383    1383  2766   
13  0.865154  0.000000  Early onset cardiomyopathy      264     290   554   
14  0.794549  0.000000             Noonan syndrome      222     130   352   
15  0.879790  0.000000  Primary ciliary dyskinesia        0       0     0   
16  0.842375  0.000000     Developmental disorders      417     384   801   
17  0.759871  0.000000           Leukemia-Lymphoma       45      15    60   

   n_train  
0      103  
1        0  
2      397  
3      135  
4       90  
5        1  
6        0  
7      603  
8        0  
9       88  
10       0  
11     497  
12    2766  
13     554  
14     352  
15       0  
16     801  
17      60  
The mean of the M-W analysis AUC: 0.8525369442888027

Hereditary cancer | Optimal hyper | unbalanced

Back

In [8]:
full_auc_analysis(
    curr_setup = 'Hereditary cancer, Optimal Hyper, unbalanced ds',
    train_loc = './test_output/model_2_0/random_hyper/ek/unbalanced/ek_unbalanced_train.txt',
    test_loc = './test_output/model_2_0/random_hyper/ek/unbalanced/ek_unbalanced_test.txt',
    auc_analysis_name= 'auc_analysis_randomhyper_ek_unbalanced.csv',
    training_set_loc='./datafiles/hereditarycancer.txt.gz',
    model='./test_output/model_2_0/random_hyper/ek/unbalanced/xgb_optimal_model.pickle.dat',
    filter_out='./test_output/model_2_0/default_hyper/ek/unbalanced/splitted_train_dataset.tsv.gz'
)
Parameter learning_rate is set to 0.11829773260174394
Parameter n_estimators is set to 541
Parameter max_depth is set to 16
There are 26776 samples in the training set.
AUC analysis of the training dataset reveals AUC: 0.9229124513299454
AUC analysis of the testing dataset reveals AUC: 0.6842662092708873
File ./not_saving_directory/auc_analysis_randomhyper_ek_unbalanced.csv found. Loading.
Top 10 worst performing genes: 
          gene       auc        f1  recall  fpr  precision  n_benign  \
1776      SAA1  0.000000  0.000000     0.0  1.0   0.000000        61   
1995     PTHLH  0.000000  0.285714     1.0  0.0   0.166667         5   
2270     CDK10  0.000000  0.500000     0.5  0.5   0.500000         1   
2336  TRAF3IP1  0.000000  0.000000     0.0  1.0   0.000000         3   
246      ASCC1  0.031746  0.000000     0.0  1.0   0.000000        63   
1156    ABCA13  0.064103  0.026667     1.0  0.0   0.013514        78   
499    C12orf4  0.090909  0.086957     1.0  0.0   0.045455        22   
749      ERMAP  0.095238  0.000000     0.0  1.0   0.000000        21   
2017    UBQLN2  0.142857  0.000000     0.0  1.0   0.000000        49   
256   KIAA0196  0.179739  0.000000     0.0  1.0   0.000000       306   

      n_malign  n_tot  n_train  n_test  
1776         1     62       61       1  
1995         1      6        4       2  
2270         2      3        3       0  
2336         1      4        4       0  
246          1     64       56       8  
1156         1     79       51      28  
499          1     23       18       5  
749          1     22       18       4  
2017         1     50       50       0  
256          1    307      299       8  
/home/rjsietsma/PycharmProjects/dsls_master_thesis/side_scripts/utilities.py:429: RuntimeWarning: Mean of empty slice.
  x_mean = x.mean()
/home/rjsietsma/PycharmProjects/dsls_master_thesis/venv/lib/python3.8/site-packages/numpy/core/_methods.py:170: RuntimeWarning: invalid value encountered in double_scalars
  ret = ret.dtype.type(ret / rcount)
/home/rjsietsma/PycharmProjects/dsls_master_thesis/venv/lib/python3.8/site-packages/numpy/core/_methods.py:233: RuntimeWarning: Degrees of freedom <= 0 for slice
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
/home/rjsietsma/PycharmProjects/dsls_master_thesis/venv/lib/python3.8/site-packages/numpy/core/_methods.py:194: RuntimeWarning: invalid value encountered in true_divide
  arrmean = um.true_divide(
/home/rjsietsma/PycharmProjects/dsls_master_thesis/venv/lib/python3.8/site-packages/numpy/core/_methods.py:226: RuntimeWarning: invalid value encountered in double_scalars
  ret = ret.dtype.type(ret / rcount)
UMCG genepanels Mann-Whitney analysis: 
    two-sided      less   greater category_x                 compared_to  \
0    0.586013  0.293007  0.710585        all               Neurogenetics   
1    0.243395  0.121697  0.885298        all                 Amyloidosis   
2    0.202817  0.101408  0.900285        all              Cardiovascular   
3    0.124162  0.939490  0.062081        all    Primary Immunodeficiency   
4    0.140843  0.070421  0.931312        all                        Skin   
5    0.316113  0.158056  0.844545        all                    Epilepsy   
6    0.190893  0.910411  0.095446        all                  Angioedema   
7    0.878702  0.439351  0.567719        all                   Metabolic   
8    0.376834  0.820977  0.188417        all    Hyper-/ hypophosphatemia   
9    0.396218  0.811583  0.198109        all                Mitochondria   
10   0.664955  0.674979  0.332478        all                   Fertility   
11   0.223698  0.111849  0.892026        all    Congenital heart defects   
12   0.005729  0.997210  0.002864        all           Hereditary cancer   
13   0.723715  0.651306  0.361858        all  Early onset cardiomyopathy   
14   0.128500  0.064250  0.940074        all             Noonan syndrome   
15   0.322274  0.847353  0.161137        all  Primary ciliary dyskinesia   
16   0.524583  0.262292  0.749096        all     Developmental disorders   
17   0.215997  0.107998  0.898419        all           Leukemia-Lymphoma   

        mean       std                  category_y n_benign n_malign  n_tot  \
0   0.928803  0.035236               Neurogenetics     1090      573   1663   
1   0.894592  0.000000                 Amyloidosis        0        0      0   
2   0.922019  0.041362              Cardiovascular     1381      943   2324   
3   0.951070  0.042149    Primary Immunodeficiency     1411      590   2001   
4   0.918301  0.030269                        Skin      822       42    864   
5   0.925602  0.033792                    Epilepsy       12        2     14   
6   0.975556  0.000000                  Angioedema        0        0      0   
7   0.918704  0.053059                   Metabolic     2922     2215   5137   
8   0.961655  0.000000    Hyper-/ hypophosphatemia        0        0      0   
9   0.961443  0.000000                Mitochondria      411      255    666   
10  0.936082  0.050091                   Fertility        0        0      0   
11  0.913756  0.019465    Congenital heart defects     2324     1037   3361   
12  0.946918  0.043966           Hereditary cancer    16869     9907  26776   
13  0.948009  0.000000  Early onset cardiomyopathy     3682     3126   6808   
14  0.850671  0.000000             Noonan syndrome     1081      815   1896   
15  0.965118  0.000000  Primary ciliary dyskinesia        0        0      0   
16  0.922018  0.000000     Developmental disorders     3999     2044   6043   
17  0.886094  0.000000           Leukemia-Lymphoma      466       29    495   

   n_train  
0     1663  
1        0  
2     2324  
3     2001  
4      864  
5       14  
6        0  
7     5137  
8        0  
9      666  
10       0  
11    3361  
12   26776  
13    6808  
14    1896  
15       0  
16    6043  
17     495  
The mean of the M-W analysis AUC: 0.929245097033707

Neurogenetics | Optimal hyper | balanced

Back

In [9]:
full_auc_analysis(
    curr_setup = 'Neurogenetics, Optimal Hyper, balanced ds',
    train_loc = './test_output/model_2_0/random_hyper/neuro/balanced/neuro_balanced_train.txt',
    test_loc = './test_output/model_2_0/random_hyper/neuro/balanced/neuro_balanced_test.txt',
    auc_analysis_name= 'auc_analysis_randomhyper_neuro_balanced.csv',
    training_set_loc='./test_output/model_2_0/default_hyper/neuro/balanced/train_balanced_dataset.tsv.gz',
    model='./test_output/model_2_0/random_hyper/neuro/balanced/xgb_optimal_model.pickle.dat',
    filter_out='./test_output/model_2_0/default_hyper/neuro/unbalanced/splitted_train_dataset.tsv.gz'
)
Parameter learning_rate is set to 0.0957402972203567
Parameter n_estimators is set to 352
Parameter max_depth is set to 10
There are 5804 samples in the training set.
AUC analysis of the training dataset reveals AUC: 0.8022959538708234
AUC analysis of the testing dataset reveals AUC: 0.7219954741583261
File ./not_saving_directory/auc_analysis_randomhyper_neuro_balanced.csv found. Loading.
Top 10 worst performing genes: 
          gene  auc        f1  recall  fpr  precision  n_benign  n_patho  \
2252      EFHB  0.0  0.000000     0.0  1.0   0.000000         1        1   
1453      FAR1  0.0  0.666667     1.0  0.0   0.500000         1        1   
41       ASCC1  0.0  0.030769     1.0  0.0   0.015625        63        1   
1563      ANK3  0.0  0.000000     0.0  1.0   0.000000       110        1   
2196     MPZL2  0.0  0.666667     1.0  0.0   0.500000         1        1   
2191  TRAF3IP1  0.0  0.000000     0.0  1.0   0.000000         3        1   
1337      SMPX  0.0  0.285714     1.0  0.0   0.166667         5        1   
783       PIGW  0.0  0.857143     1.0  0.0   0.750000         1        3   
960       SORD  0.0  0.500000     1.0  0.0   0.333333         2        1   
1220  HIST1H1E  0.0  0.285714     1.0  0.0   0.166667         5        1   

      n_tot  n_train  n_test  
2252      2        2       0  
1453      2        2       0  
41       64       56       8  
1563    111      106       5  
2196      2        2       0  
2191      4        4       0  
1337      6        5       1  
783       4        4       0  
960       3        2       1  
1220      6        5       1  
UMCG genepanels Mann-Whitney analysis: 
    two-sided      less   greater category_x                 compared_to  \
0    0.003974  0.998078  0.001987        all               Neurogenetics   
1    0.943637  0.542228  0.471818        all                 Amyloidosis   
2    0.379140  0.189570  0.813024        all              Cardiovascular   
3    0.240301  0.882428  0.120150        all    Primary Immunodeficiency   
4    0.932788  0.466394  0.538761        all                        Skin   
5    0.290664  0.857128  0.145332        all                    Epilepsy   
6    0.119853  0.059926  0.944166        all                  Angioedema   
7    0.794599  0.609608  0.397299        all                   Metabolic   
8    0.215997  0.898419  0.107998        all    Hyper-/ hypophosphatemia   
9    0.190893  0.910411  0.095446        all                Mitochondria   
10   0.256685  0.128342  0.875931        all               Preconception   
11   0.126986  0.063493  0.939035        all    Congenital heart defects   
12   0.094311  0.047155  0.953677        all           Hereditary cancer   
13   0.723715  0.651306  0.361858        all  Early onset cardiomyopathy   
14   0.190893  0.095446  0.910411        all             Noonan syndrome   
15   0.457878  0.228939  0.781626        all  Primary ciliary dyskinesia   
16   0.777331  0.388666  0.624814        all     Developmental disorders   
17   0.339858  0.169929  0.838863        all           Leukemia-Lymphoma   

        mean       std                  category_y n_benign n_patho n_tot  \
0   0.837062  0.152472               Neurogenetics     2902    2902  5804   
1   0.844931  0.000000                 Amyloidosis        0       0     0   
2   0.806944  0.102367              Cardiovascular       14      13    27   
3   0.858956  0.038750    Primary Immunodeficiency       74     249   323   
4   0.839146  0.017012                        Skin       33      55    88   
5   0.818947  0.129298                    Epilepsy      451    1125  1576   
6   0.616508  0.000000                  Angioedema        0       0     0   
7   0.827052  0.061176                   Metabolic     1190    2318  3508   
8   0.883552  0.000000    Hyper-/ hypophosphatemia        1      44    45   
9   0.897199  0.000000                Mitochondria      250     359   609   
10  0.802508  0.044056               Preconception        1      44    45   
11  0.811649  0.009450    Congenital heart defects      120      94   214   
12  0.822336  0.065266           Hereditary cancer       31     165   196   
13  0.853661  0.000000  Early onset cardiomyopathy      390     208   598   
14  0.776111  0.000000             Noonan syndrome        0       0     0   
15  0.822418  0.000000  Primary ciliary dyskinesia        0       0     0   
16  0.833347  0.000000     Developmental disorders     1445    2427  3872   
17  0.806957  0.000000           Leukemia-Lymphoma       12       5    17   

   n_train  
0     5804  
1        0  
2       27  
3      323  
4       88  
5     1576  
6        0  
7     3508  
8       45  
9      609  
10      45  
11     214  
12     196  
13     598  
14       0  
15       0  
16    3872  
17      17  
The mean of the M-W analysis AUC: 0.8199602452486267

Neurogenetics | Optimal hyper | unbalanced

Back

In [10]:
full_auc_analysis(
    curr_setup = 'Neurogenetics, Optimal Hyper, unbalanced ds',
    train_loc = './test_output/model_2_0/random_hyper/neuro/unbalanced/neuro_unbalanced_train.txt',
    test_loc = './test_output/model_2_0/random_hyper/neuro/unbalanced/neuro_unbalanced_test.txt',
    auc_analysis_name= 'auc_analysis_randomhyper_neuro_unbalanced.csv',
    training_set_loc='./datafiles/neurogenetics.txt.gz',
    model='./test_output/model_2_0/random_hyper/neuro/unbalanced/xgb_optimal_model.pickle.dat',
    filter_out='./test_output/model_2_0/default_hyper/neuro/unbalanced/splitted_train_dataset.tsv.gz'
)
Parameter learning_rate is set to 0.09900765886072124
Parameter n_estimators is set to 248
Parameter max_depth is set to 18
There are 29769 samples in the training set.
AUC analysis of the training dataset reveals AUC: 0.9371396364066774
AUC analysis of the testing dataset reveals AUC: 0.6843054780134005
File ./not_saving_directory/auc_analysis_randomhyper_neuro_unbalanced.csv found. Loading.
Top 10 worst performing genes: 
          gene       auc        f1  recall  fpr  precision  n_benign  \
2242  TRAF3IP1  0.000000  0.000000     0.0  1.0   0.000000         3   
867      PTHLH  0.000000  0.285714     1.0  0.0   0.166667         5   
12       ASCC1  0.015873  0.000000     0.0  1.0   0.000000        63   
1822      SAA1  0.049180  0.000000     0.0  1.0   0.000000        61   
2153       TNC  0.070671  0.000000     0.0  1.0   0.000000       283   
632      ERMAP  0.095238  0.000000     0.0  1.0   0.000000        21   
1329     KITLG  0.142857  0.000000     0.0  1.0   0.000000        21   
1536       CA4  0.150327  0.000000     0.0  1.0   0.000000       153   
735   HIST1H1E  0.200000  0.285714     1.0  0.0   0.166667         5   
79        MSH3  0.278481  0.028986     1.0  0.0   0.014706        79   

      n_malign  n_tot  n_train  n_test  
2242         1      4        4       0  
867          1      6        4       2  
12           1     64       56       8  
1822         1     62       61       1  
2153         1    284      278       6  
632          1     22       18       4  
1329         1     22       22       0  
1536         1    154      153       1  
735          1      6        5       1  
79           1     80       62      18  
/home/rjsietsma/PycharmProjects/dsls_master_thesis/side_scripts/utilities.py:429: RuntimeWarning: Mean of empty slice.
  x_mean = x.mean()
/home/rjsietsma/PycharmProjects/dsls_master_thesis/venv/lib/python3.8/site-packages/numpy/core/_methods.py:170: RuntimeWarning: invalid value encountered in double_scalars
  ret = ret.dtype.type(ret / rcount)
/home/rjsietsma/PycharmProjects/dsls_master_thesis/venv/lib/python3.8/site-packages/numpy/core/_methods.py:233: RuntimeWarning: Degrees of freedom <= 0 for slice
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
/home/rjsietsma/PycharmProjects/dsls_master_thesis/venv/lib/python3.8/site-packages/numpy/core/_methods.py:194: RuntimeWarning: invalid value encountered in true_divide
  arrmean = um.true_divide(
/home/rjsietsma/PycharmProjects/dsls_master_thesis/venv/lib/python3.8/site-packages/numpy/core/_methods.py:226: RuntimeWarning: invalid value encountered in double_scalars
  ret = ret.dtype.type(ret / rcount)
UMCG genepanels Mann-Whitney analysis: 
    two-sided      less   greater category_x                 compared_to  \
0    0.351263  0.827061  0.175632        all               Neurogenetics   
1    0.203162  0.101581  0.904554        all                 Amyloidosis   
2    0.122904  0.061452  0.939706        all              Cardiovascular   
3    0.187852  0.908230  0.093926        all    Primary Immunodeficiency   
4    0.098064  0.049032  0.952271        all                        Skin   
5    0.191576  0.095788  0.906042        all                    Epilepsy   
6    0.457878  0.228939  0.781626        all                  Angioedema   
7    0.621479  0.310739  0.695573        all                   Metabolic   
8    0.416192  0.801891  0.208096        all    Hyper-/ hypophosphatemia   
9    0.547875  0.737708  0.273938        all                Mitochondria   
10   0.201030  0.903070  0.100515        all                   Fertility   
11   0.143131  0.071566  0.931208        all    Congenital heart defects   
12   0.002922  0.998579  0.001461        all           Hereditary cancer   
13   1.000000  0.514100  0.500000        all  Early onset cardiomyopathy   
14   0.128500  0.064250  0.940074        all             Noonan syndrome   
15   0.524583  0.749096  0.262292        all  Primary ciliary dyskinesia   
16   0.501809  0.250904  0.760216        all     Developmental disorders   
17   0.243395  0.121697  0.885298        all           Leukemia-Lymphoma   

        mean       std                  category_y n_benign n_malign  n_tot  \
0   0.941461  0.058496               Neurogenetics    23659     6110  29769   
1   0.905180  0.000000                 Amyloidosis        0        0      0   
2   0.913517  0.068602              Cardiovascular      241       30    271   
3   0.957886  0.033934    Primary Immunodeficiency     1593      725   2318   
4   0.928055  0.022832                        Skin      283      100    383   
5   0.928200  0.042790                    Epilepsy     5563     2235   7798   
6   0.927302  0.000000                  Angioedema        0        0      0   
7   0.927154  0.048202                   Metabolic    13038     4790  17828   
8   0.972649  0.000000    Hyper-/ hypophosphatemia       24       76    100   
9   0.963528  0.000000                Mitochondria     2435      736   3171   
10  0.968725  0.014805                   Fertility       24       76    100   
11  0.920330  0.019145    Congenital heart defects     1060      246   1306   
12  0.962639  0.030019           Hereditary cancer     1090      573   1663   
13  0.947836  0.000000  Early onset cardiomyopathy     2162      406   2568   
14  0.857861  0.000000             Noonan syndrome        0        0      0   
15  0.964875  0.000000  Primary ciliary dyskinesia        0        0      0   
16  0.928054  0.000000     Developmental disorders    13065     4876  17941   
17  0.909374  0.000000           Leukemia-Lymphoma       49        8     57   

   n_train  
0    29769  
1        0  
2      271  
3     2318  
4      383  
5     7798  
6        0  
7    17828  
8      100  
9     3171  
10     100  
11    1306  
12    1663  
13    2568  
14       0  
15       0  
16   17941  
17      57  
The mean of the M-W analysis AUC: 0.9347013994863806

AUC vs n_samples analysis

Plots

Index

In [4]:
# Subprocess calling of the incrementing model creations.

location = os.path.join(os.path.abspath('.'))
types = ['balanced', 'unbalanced']
categories = ['cardio', 'dyslipid', 'hc', 'neuro']
unbalanced_train_loc = {
    "cardio": 'cardiovascular.txt.gz',
    'dyslipid': 'dyslipid.txt.gz',
    'hc': 'hereditarycancer.txt.gz',
    'neuro': 'neurogenetics.txt.gz'
}
specified_defaults = {
    'cardio':{
        'balanced': os.path.join(location, 'not_saving_directory', 'cardiovascular_balanced.json'),
        'unbalanced': os.path.join(location, 'not_saving_directory', 'cardiovascular_unbalanced.json')
    },
    'dyslipid':{
        'balanced': os.path.join(location, 'not_saving_directory', 'dyslipid_balanced.json'),
        'unbalanced': os.path.join(location, 'not_saving_directory', 'dyslipid_unbalanced.json')   
    },
    'hc':{
        'balanced': os.path.join(location, 'not_saving_directory', 'ek_balanced.json'),
        'unbalanced': os.path.join(location, 'not_saving_directory', 'ek_unbalanced.json')
    },
    'neuro':{
        'balanced': os.path.join(location, 'not_saving_directory', 'neuro_balanced.json'),
        'unbalanced': os.path.join(location, 'not_saving_directory', 'neuro_unbalanced.json')
    }
}
attempts = ['attempt_2', 'attempt_3']
levels = np.arange(10,100, 10)
level = np.round_(np.arange(0.1,1,0.1), decimals=2)
for tipe in types:
    for category in categories:
        specified_d = specified_defaults[category][tipe]
        if tipe == 'balanced':
            input_loc = os.path.join(location, 'output_incrementing_models', tipe, category, 'train_balanced_dataset.tsv.gz')
        else:
            input_loc = os.path.join(location, 'datafiles', unbalanced_train_loc[category])
        for i in range(9):
            folder = str(levels[i])
            percentage = str(level[i])
            for attempt in attempts:
                output_loc = os.path.join(location, 'output_incrementing_models', tipe, category, folder, attempt)
                if not os.path.isfile(os.path.join(output_loc, 'splitted_test_dataset.tsv.gz')):
                    command = f'python3 /home/rjsietsma/PycharmProjects/train_capice_model/train_model.py -b {input_loc} -o {output_loc} -s {percentage} -v -d -sd {specified_d}'
                    print(f"Calling: {command}")
                    subprocess.call(command.split(' '))
In [5]:
for path in Path('./').rglob('attempt_*'):
    if not os.path.isfile(os.path.join(path, 'test_results.txt')):
        path = os.path.abspath(path)
        input_file = os.path.join(path, 'splitted_test_dataset.tsv.gz')
        input_model = os.path.join(path, 'xgb_ransearch.pickle.dat')
        output_path = os.path.join(path, 'test_results.txt')
        dev_null = Path('/dev/null')
        command = f"bash /home/rjsietsma/PycharmProjects/capice/predict.sh {input_file} {input_model} {output_path} {dev_null}"
        print(f"Calling command: {command} \n")
        subprocess.call(command.split(' '))
    
In [6]:
def can_be_converted(entry):
    return_value = False
    try:
        entry['chr'] = entry['chr'].astype(np.float64)
        return_value = True
    except ValueError:
        return_value = False
    return return_value
In [7]:
incrementing_auc_df_balanced = pd.DataFrame(columns=['panel', 'train_size', 'auc', 'stdev'])
incrementing_auc_df_unbalanced = pd.DataFrame(columns=['panel', 'train_size', 'auc', 'stdev'])
location_output = 'output_incrementing_models'
attempts = ['attempt_1', 'attempt_2', 'attempt_3']

time_bfl = time.time()
for tipe in types:
    for category in categories:
        for i in range(9):
            time_ifl = time.time()
            folder = str(levels[i])
            percentage_train = 1 - level[i]
            if time_ifl - time_bfl > 30:
                print("Still processing, currently on:")
                print(f"Type: {tipe}")
                print(f"Category: {category}")
                print(f"Folder: {folder}")
                print("\n")
                time_bfl = time.time()
            aucs = []
            sizes = []
            for j, attempt in enumerate(attempts):
                path_to_file = os.path.join(location_output, tipe, category, folder, attempt)
                test_results = read_capice_output(os.path.join(path_to_file, 'test_results.txt'))
                test_input = pd.read_csv(os.path.join(path_to_file, 'splitted_test_dataset.tsv.gz'), sep='\t', low_memory=False, usecols=['#Chrom', 'Pos', 'Ref', 'Alt', 'binarized_label'])
                train_input = pd.read_csv(os.path.join(path_to_file, 'splitted_train_dataset.tsv.gz'), sep='\t', low_memory=False)
                test_input.rename(
                    columns={'#Chrom': 'chr',
                             'Pos': 'pos',
                             'Ref': 'ref',
                             'Alt': 'alt'},
                    inplace=True
                )
                if can_be_converted(test_results):
                    test_results['chr'] = test_results['chr'].astype(np.float64)
                if test_results['chr'].dtype == np.float64:
                    test_results['chr'] = test_results['chr'].astype(np.int64)
                if test_results['chr'].dtype == np.int64:
                    test_results['chr'] = test_results['chr'].astype(np.object)
                test_input['chr'] = test_input['chr'].astype(np.object)
                test_input['pos'] = test_input['pos'].astype(np.int64)
                merge = test_results.merge(test_input, on=['chr','pos','ref','alt'])
                for gene in merge['GeneName'].unique():
                    subset = merge[merge['GeneName'] == gene]
                    y_pred = np.array(merge['probabilities'])
                    y_true = np.array(merge['binarized_label'])
                    if np.unique(y_true).size > 1:
                        aucs.append(roc_auc_score(y_true=y_true, y_score=y_pred))
                        sizes.append(train_input.shape[0])
                    else:
                        print(f"Category: {category}, folder: {folder}, attempt: {attempt}, gene {gene} does not have enough datapoints for AUC calculations.")
                        continue
            aucs = np.array(aucs)
            sizes = np.array(sizes)
            add_df = pd.DataFrame({
                'panel': category,
                'train_size': sizes.mean(),
                'auc': aucs.mean(),
                'stdev': aucs.std()
            }, index=[0])
            if tipe == 'balanced':
                incrementing_auc_df_balanced = incrementing_auc_df_balanced.append(add_df, ignore_index=True)
            else:
                incrementing_auc_df_unbalanced = incrementing_auc_df_unbalanced.append(add_df, ignore_index=True)
Still processing, currently on:
Type: balanced
Category: neuro
Folder: 60


Still processing, currently on:
Type: unbalanced
Category: cardio
Folder: 50


Still processing, currently on:
Type: unbalanced
Category: hc
Folder: 10


Still processing, currently on:
Type: unbalanced
Category: neuro
Folder: 20


Still processing, currently on:
Type: unbalanced
Category: neuro
Folder: 70


Plots

Back

In [8]:
def func(x, a, b, c):
    return a*np.log(b*x) + c
In [34]:
incrementing_auc_df_balanced
Out[34]:
panel train_size auc stdev color
0 cardio 7466.0 0.961399 2.220446e-16 #009E73
1 cardio 6636.0 0.948956 3.330669e-16 #009E73
2 cardio 5807.0 0.945661 0.000000e+00 #009E73
3 cardio 4977.0 0.940869 2.220446e-16 #009E73
4 cardio 4148.0 0.933722 3.156790e-04 #009E73
5 cardio 3318.0 0.925220 1.110223e-16 #009E73
6 cardio 2488.0 0.913168 3.330669e-16 #009E73
7 cardio 1659.0 0.901544 0.000000e+00 #009E73
8 cardio 829.0 0.772909 1.110223e-16 #009E73
9 dyslipid 1377.0 0.991068 1.214543e-03 #F0E442
10 dyslipid 1224.0 0.989309 6.047783e-05 #F0E442
11 dyslipid 1071.0 0.974086 4.488370e-06 #F0E442
12 dyslipid 918.0 0.978665 3.330669e-16 #F0E442
13 dyslipid 765.0 0.982815 4.440892e-16 #F0E442
14 dyslipid 612.0 0.972700 6.712614e-06 #F0E442
15 dyslipid 459.0 0.975562 3.287867e-04 #F0E442
16 dyslipid 306.0 0.958455 2.220446e-16 #F0E442
17 dyslipid 153.0 0.961800 5.551115e-16 #F0E442
18 hc 3097.0 0.939194 1.590112e-04 #E69F00
19 hc 2753.0 0.929592 6.434743e-04 #E69F00
20 hc 2409.0 0.929827 2.109552e-03 #E69F00
21 hc 2065.0 0.924337 1.017026e-03 #E69F00
22 hc 1721.0 0.919613 9.077497e-04 #E69F00
23 hc 1376.0 0.919914 1.517469e-04 #E69F00
24 hc 1032.0 0.898572 3.901292e-03 #E69F00
25 hc 688.0 0.883858 2.899220e-03 #E69F00
26 hc 344.0 0.845710 8.106001e-03 #E69F00
27 neuro 6476.0 0.939298 4.440892e-16 #0072B2
28 neuro 5756.0 0.934738 2.220446e-16 #0072B2
29 neuro 5037.0 0.927143 2.220446e-16 #0072B2
30 neuro 4317.0 0.929010 3.330669e-16 #0072B2
31 neuro 3598.0 0.926938 4.564708e-04 #0072B2
32 neuro 2878.0 0.922623 3.330669e-16 #0072B2
33 neuro 2158.0 0.910552 6.094115e-05 #0072B2
34 neuro 1439.0 0.895915 2.662800e-05 #0072B2
35 neuro 719.0 0.870842 2.565897e-03 #0072B2
In [32]:
basepanel_performance = {
    'cardio': 0.746953,
    'hc': 0.854261,
    'neuro': 0.869192
}

from bokeh.palettes import Colorblind
import random

categories = incrementing_auc_df_balanced['panel'].unique()
# pallette = magma(categories.size)
# pallette = ('#000000', '#FF00F0', '#00FFFF', '#DC5039')
blind = list(Colorblind[4])
print(blind)
pallette = blind[:]
random.shuffle(pallette)
print(pallette)
colormap = dict(zip(categories, pallette))
incrementing_auc_df_balanced['color'] = incrementing_auc_df_balanced['panel'].map(colormap)
x_axis = np.linspace(0, incrementing_auc_df_balanced['train_size'].max(), int(incrementing_auc_df_balanced['train_size'].max()), endpoint=True)
plt.figure(figsize=(9,7))
for panel in incrementing_auc_df_balanced['panel'].unique():
    subset = incrementing_auc_df_balanced[incrementing_auc_df_balanced['panel'] == panel]
    color = subset['color'].unique()[0]
    plt.errorbar(
        x=subset['train_size'],
        y=subset['auc'],
        yerr=subset['stdev'],
        c=color,
        fmt='o',
        capsize=0,
        elinewidth=3,
        ls=None
    )
    x = np.array(subset['train_size'])
    y = np.array(subset['auc'])
    popt, pcov = curve_fit(func, x, y)
    popt = np.round(popt, decimals=4)
    formula = f"{popt[0]}*log({popt[1]} * x) + {popt[2]}"
    y_exp = func(x_axis, *popt)
    plt.plot(x_axis, y_exp, c=color, label=f"{panel} (y={formula})")
    if panel in basepanel_performance.keys():
        auc = basepanel_performance[panel]
        intercept_df = pd.DataFrame(
            {
                "x_value": x_axis,
                "y_exp": y_exp
            })

        intercept = math.floor(intercept_df.iloc[(intercept_df['y_exp'] - auc).abs().argsort()[:1]]['x_value'].values[0])
        plt.plot(x_axis, np.full(x_axis.shape, auc), '--', c=color,
                label=f"Base of {panel} (AUC={auc})")
        plt.scatter(intercept, auc, s=200, marker='*', c=color,
                    label=f"Intercept of {panel}: x={intercept}, y={auc}")
plt.legend(loc='upper right', bbox_to_anchor=(1.6,1.025))
plt.xlabel('Train dataset size')
plt.ylabel('AUC')
plt.ylim((0.5,1.01))
plt.title('Incrementing AUC analysis of balanced models.')
plt.show()
['#0072B2', '#E69F00', '#F0E442', '#009E73']
['#009E73', '#F0E442', '#E69F00', '#0072B2']
<ipython-input-8-3589de1fe55e>:2: RuntimeWarning: invalid value encountered in log
  return a*np.log(b*x) + c
<ipython-input-8-3589de1fe55e>:2: RuntimeWarning: divide by zero encountered in log
  return a*np.log(b*x) + c
<ipython-input-8-3589de1fe55e>:2: RuntimeWarning: invalid value encountered in log
  return a*np.log(b*x) + c
<ipython-input-8-3589de1fe55e>:2: RuntimeWarning: divide by zero encountered in log
  return a*np.log(b*x) + c
<ipython-input-8-3589de1fe55e>:2: RuntimeWarning: invalid value encountered in log
  return a*np.log(b*x) + c
<ipython-input-8-3589de1fe55e>:2: RuntimeWarning: divide by zero encountered in log
  return a*np.log(b*x) + c
<ipython-input-8-3589de1fe55e>:2: RuntimeWarning: invalid value encountered in log
  return a*np.log(b*x) + c
<ipython-input-8-3589de1fe55e>:2: RuntimeWarning: divide by zero encountered in log
  return a*np.log(b*x) + c
In [96]:
basepanel_performance = {
    'cardio': 0.659012,
    'hc': 0.638193,
    'neuro': 0.767613
}

categories = incrementing_auc_df_unbalanced['panel'].unique()
pallette = viridis(categories.size)
colormap = dict(zip(categories, pallette))
incrementing_auc_df_unbalanced['color'] = incrementing_auc_df_unbalanced['panel'].map(colormap)
x_axis = np.linspace(0, incrementing_auc_df_unbalanced['train_size'].max(), int(incrementing_auc_df_unbalanced['train_size'].max()), endpoint=True)
plt.figure(figsize=(9,7))
for panel in incrementing_auc_df_unbalanced['panel'].unique():
    subset = incrementing_auc_df_unbalanced[incrementing_auc_df_unbalanced['panel'] == panel]
    color = subset['color'].unique()[0]
    plt.errorbar(
        x=subset['train_size'],
        y=subset['auc'],
        yerr=subset['stdev'],
        c=color,
        fmt='o',
        capsize=0,
        elinewidth=3,
        ls=None
    )
    x = np.array(subset['train_size'])
    y = np.array(subset['auc'])
    popt, pcov = curve_fit(func, x, y)
    popt = np.round(popt, decimals=4)
    formula = f"{popt[0]}*log({popt[1]} * x) + {popt[2]}"
    y_exp = func(x_axis, *popt)
    plt.plot(x_axis, y_exp, c=color, label=f"{panel} (y={formula})")
    if panel in basepanel_performance.keys():
        auc = basepanel_performance[panel]
        intercept_df = pd.DataFrame(
            {
                "x_value": x_axis,
                "y_exp": y_exp
            })

        intercept = math.floor(intercept_df.iloc[(intercept_df['y_exp'] - auc).abs().argsort()[:1]]['x_value'].values[0])
        plt.plot(x_axis, np.full(x_axis.shape, auc), '--', c=color,
                label=f"Base of {panel} (AUC={auc})")
        plt.scatter(intercept, auc, s=200, marker='*', c=color,
                    label=f"Intercept of {panel}: x={intercept}, y={auc}")
plt.legend(loc='upper right', bbox_to_anchor=(1.6,1.025))
plt.xlabel('Train dataset size')
plt.ylabel('AUC')
plt.title('Incrementing AUC analysis of unbalanced models.')
plt.show()
<ipython-input-7-3589de1fe55e>:2: RuntimeWarning: invalid value encountered in log
  return a*np.log(b*x) + c
<ipython-input-7-3589de1fe55e>:2: RuntimeWarning: divide by zero encountered in log
  return a*np.log(b*x) + c
<ipython-input-7-3589de1fe55e>:2: RuntimeWarning: invalid value encountered in log
  return a*np.log(b*x) + c
<ipython-input-7-3589de1fe55e>:2: RuntimeWarning: divide by zero encountered in log
  return a*np.log(b*x) + c
<ipython-input-7-3589de1fe55e>:2: RuntimeWarning: invalid value encountered in log
  return a*np.log(b*x) + c
<ipython-input-7-3589de1fe55e>:2: RuntimeWarning: divide by zero encountered in log
  return a*np.log(b*x) + c
<ipython-input-7-3589de1fe55e>:2: RuntimeWarning: invalid value encountered in log
  return a*np.log(b*x) + c
<ipython-input-7-3589de1fe55e>:2: RuntimeWarning: divide by zero encountered in log
  return a*np.log(b*x) + c

Barplot of type of variants for each panel

Index

In [3]:
variants = pd.read_csv('./datafiles/train.txt.gz', compression='gzip', sep='\t', low_memory=False)
variants
Out[3]:
#Chrom Allergy/Immunology/Infectious Alt AnnoType Audiologic/Otolaryngologic Biochemical CCDS CDSpos Cardiovascular ConsDetail ... revel sift source tOverlapMotifs targetScan to_be_deleted verPhCons verPhyloP inTest sample_weight
0 14 False G CodingTranscript False False CCDS9787.1 806.0 False frameshift ... NaN NaN vkgl NaN NaN False 1.000 5.843 False 1.0
1 20 False T CodingTranscript True False CCDS13112.1 1899.0 True frameshift,stop_gained ... NaN NaN vkgl NaN NaN False 1.000 4.670 False 1.0
2 20 False C CodingTranscript True False CCDS13112.1 2118.0 True frameshift ... NaN NaN vkgl NaN NaN False 1.000 5.043 False 1.0
3 20 False A CodingTranscript True False CCDS13112.1 1586.0 True frameshift ... NaN NaN vkgl NaN NaN False 1.000 6.221 False 1.0
4 20 False A Intergenic True False NaN NaN True downstream ... NaN NaN vkgl NaN NaN False 1.000 6.368 False 1.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
334596 17 False A CodingTranscript False False CCDS32642.1 1563.0 False stop_gained ... NaN NaN unknown NaN NaN False 1.000 6.031 False 0.8
334597 17 False T CodingTranscript False False CCDS32642.1 2029.0 False stop_gained ... NaN NaN unknown NaN NaN False 1.000 4.100 False 0.8
334598 10 False T CodingTranscript False False CCDS7431.1 1216.0 False stop_gained ... NaN NaN unknown NaN NaN False 1.000 5.852 False 0.8
334599 2 False T CodingTranscript False False CCDS2382.1 2998.0 False stop_gained ... NaN NaN unknown NaN NaN False 0.031 2.213 False 0.8
334600 5 False T CodingTranscript False False CCDS3952.1 1221.0 False stop_gained ... NaN NaN unknown NaN NaN False 1.000 0.528 False 0.8

334601 rows × 152 columns

In [4]:
ax = variants['Consequence'].value_counts().plot(kind='bar')
ax.set_title('Type of SNV over the entire training dataset.')
ax.set_ylabel('Count')
ax.set_xlabel('Consequence')
Out[4]:
Text(0.5, 0, 'Consequence')
In [5]:
# Reduce the information to just panels.

panels = {}
for panel in genepanels.keys():
    panels[panel] = []
    disease_panel = genepanels[panel]
    if isinstance(disease_panel, dict):
        for disease, genes in disease_panel.items():
            for gene in genes:
                if gene not in panels[panel]:
                    panels[panel].append(gene)
In [6]:
for panel, genes in panels.items():
    title = f'Type of SNV over the entire training dataset, panel: {panel}'
    subset = variants[variants['GeneName'].isin(genes)]
    data = subset['Consequence'].value_counts()
    ax = data.plot.bar(x='index', y='Consequence',
                      colormap='viridis',
                      title=title)
    ax.set_ylabel('Count')
    ax.set_xlabel('Panel')
    fig = plt.Figure()
    fig.axes.append(ax)
    plt.show()